Fix integer overflow in CSV parser causing segfault

ssam18 · ssam18 · commit 4bc51beacc4e · 2025-11-12T12:08:30.000-06:00
Fixes #63089

When parsing scientific notation in CSV files, extremely large exponent values (e.g., '4e492493924924') caused integer overflow in the exponent accumulation loop, leading to undefined behavior and segmentation faults.

The issue occurred in xstrtod() at pandas/_libs/src/parser/tokenizer.c, where exponent digits were accumulated without bounds checking:

    int n = 0;
    while (isdigit_ascii(*p)) {
      n = n * 10 + (*p - '0');  // Overflow here with large exponents
      ...
    }

Solution:
- Add a maximum exponent digits cap (MAX_EXPONENT_DIGITS = 4) to prevent overflow while still allowing valid scientific notation.
- Continue consuming the remaining digits to maintain the correct parsing position.
- The capped value (up to 9999) is sufficient, since the subsequent range check (DBL_MIN_EXP to DBL_MAX_EXP) will catch invalid exponents.

This fix prevents the overflow while maintaining correct parsing behavior for both valid and invalid exponent values.

Signed-off-by: Samaresh Kumar Singh <ssam3003@gmail.com>
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1510,8 +1510,14 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     // Process string of digits.
     num_digits = 0;
     int n = 0;
+    // Saturate the exponent value rather than counting digits, so leading
+    // zeros (e.g. "1e0000000005") cannot exhaust the budget and drop digits.
+    const int MAX_EXPONENT = 9999; // n stops growing once it exceeds this
     while (isdigit_ascii(*p)) {
-      n = n * 10 + (*p - '0');
+      if (n <= MAX_EXPONENT) {
+        n = n * 10 + (*p - '0');
+      }
+      // Keep consuming digits after saturation to preserve the parse position
       num_digits++;
       p++;
     }
diff --git a/pandas/tests/io/parser/test_issue_63089.py b/pandas/tests/io/parser/test_issue_63089.py
@@ -0,0 +1,37 @@
+"""
+Test for issue #63089 - read_csv segfault with large exponent
+"""
+import io
+import pandas as pd
+import pytest
+
+
+class TestIssue63089:
+    """Regression tests for GH#63089: oversized exponents in read_csv."""
+
+    def test_large_exponent_no_segfault(self):
+        """An exponent far beyond the double range must not crash the parser."""
+        import numpy as np
+
+        # This input previously caused SIGSEGV via integer overflow while
+        # accumulating the exponent digits.
+        result = pd.read_csv(io.StringIO("h\n4e492493924924"))
+
+        assert len(result) == 1
+        assert "h" in result.columns
+        # The exponent is far too large, so the value should be infinity.
+        value = result["h"].iloc[0]
+        assert np.isinf(value) or value > 1e308
+
+    @pytest.mark.parametrize(
+        "test_val",
+        [
+            "1e999999999",  # Very large positive exponent
+            "1e-999999999",  # Very large negative exponent
+            "2.5e123456789",  # Large exponent with decimal
+        ],
+    )
+    def test_various_large_exponents(self, test_val):
+        """Parsing must complete and yield exactly one row for each case."""
+        result = pd.read_csv(io.StringIO(f"col\n{test_val}"))
+        assert len(result) == 1
diff --git a/reproduce_issue_63089.py b/reproduce_issue_63089.py
@@ -0,0 +1,11 @@
+import io
+import pandas as pd
+
+# Reproducer for GH#63089: an oversized exponent used to crash the C parser.
+print("Testing issue #63089...")
+csv_text = "h\n4e492493924924"
+try:
+    frame = pd.read_csv(io.StringIO(csv_text))
+    print("Success! Result:", frame, sep="\n")
+except Exception as exc:
+    print(f"Exception occurred: {type(exc).__name__}: {exc}")