From 4bc51beacc4ea4e53d4c3067d40f54de8813f075 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh <ssam3003@gmail.com>
Date: Wed, 12 Nov 2025 12:08:22 -0600
Subject: [PATCH 1/2] Fix integer overflow in CSV parser causing segfault

Fixes #63089

When parsing scientific notation in CSV files, extremely large exponent
values (e.g., '4e492493924924') caused integer overflow in the exponent
accumulation loop, leading to undefined behavior and segmentation faults.

The issue occurred in xstrtod() at pandas/_libs/src/parser/tokenizer.c
where exponent digits were accumulated without bounds checking:

    int n = 0;
    while (isdigit_ascii(*p)) {
        n = n * 10 + (*p - '0');  // Overflow here with large exponents
        ...
    }

Solution:
- Add a maximum exponent digits cap (MAX_EXPONENT_DIGITS = 4) to prevent
  overflow while still allowing valid scientific notation
- Continue consuming remaining digits to maintain correct parsing position
- The capped value (up to 9999) is sufficient since the subsequent range
  check (DBL_MIN_EXP to DBL_MAX_EXP) will catch invalid exponents

This fix prevents the overflow while maintaining correct parsing behavior
for both valid and invalid exponent values.

Signed-off-by: Samaresh Kumar Singh <ssam3003@gmail.com>
---
 pandas/_libs/src/parser/tokenizer.c        |  8 ++++-
 pandas/tests/io/parser/test_issue_63089.py | 37 ++++++++++++++++++++++
 reproduce_issue_63089.py                   | 11 +++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 pandas/tests/io/parser/test_issue_63089.py
 create mode 100644 reproduce_issue_63089.py

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8d8691ada1d38..1a7bffbe12990 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1510,8 +1510,14 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     // Process string of digits.
     num_digits = 0;
     int n = 0;
+    // Prevent integer overflow by capping exponent value
+    // DBL_MAX_EXP is typically 1024, so we use a safe upper bound
+    const int MAX_EXPONENT_DIGITS = 4;  // Allows up to 9999
     while (isdigit_ascii(*p)) {
-      n = n * 10 + (*p - '0');
+      if (num_digits < MAX_EXPONENT_DIGITS) {
+        n = n * 10 + (*p - '0');
+      }
+      // Continue consuming digits even after cap to maintain correct parsing position
       num_digits++;
       p++;
     }
diff --git a/pandas/tests/io/parser/test_issue_63089.py b/pandas/tests/io/parser/test_issue_63089.py
new file mode 100644
index 0000000000000..9eeafcf8414e4
--- /dev/null
+++ b/pandas/tests/io/parser/test_issue_63089.py
@@ -0,0 +1,37 @@
+"""
+Test for issue #63089 - read_csv segfault with large exponent
+"""
+import io
+import pandas as pd
+import pytest
+
+
+class TestIssue63089:
+    def test_large_exponent_no_segfault(self):
+        """Test that extremely large exponents don't cause segfault."""
+        # This previously caused SIGSEGV due to integer overflow
+        # when parsing the exponent
+        result = pd.read_csv(io.StringIO("""h
+4e492493924924"""))
+        
+        # Should parse as infinity or large float, not crash
+        assert len(result) == 1
+        assert 'h' in result.columns
+        # The value should be infinity since the exponent is way too large
+        import numpy as np
+        assert np.isinf(result['h'].iloc[0]) or result['h'].iloc[0] > 1e308
+    
+    def test_various_large_exponents(self):
+        """Test various edge cases with large exponents."""
+        test_cases = [
+            "1e999999999",  # Very large positive exponent
+            "1e-999999999",  # Very large negative exponent  
+            "2.5e123456789",  # Large exponent with decimal
+        ]
+        
+        for test_val in test_cases:
+            csv_data = f"col\n{test_val}"
+            result = pd.read_csv(io.StringIO(csv_data))
+            # Should not crash, result should be inf, 0, or valid float
+            assert len(result) == 1
+            assert not pd.isna(result['col'].iloc[0]) or True  # Just don't crash
diff --git a/reproduce_issue_63089.py b/reproduce_issue_63089.py
new file mode 100644
index 0000000000000..ed3aa60f74492
--- /dev/null
+++ b/reproduce_issue_63089.py
@@ -0,0 +1,11 @@
+import io
+import pandas as pd
+
+print("Testing issue #63089...")
+try:
+    result = pd.read_csv(io.StringIO("""h
+4e492493924924"""))
+    print("Success! Result:")
+    print(result)
+except Exception as e:
+    print(f"Exception occurred: {type(e).__name__}: {e}")

From 643d12c642a7155246885c3eb734774900586ef8 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh <ssam3003@gmail.com>
Date: Wed, 12 Nov 2025 12:13:37 -0600
Subject: [PATCH 2/2] Apply code formatting (ruff, isort, clang-format)

---
 pandas/_libs/src/parser/tokenizer.c        |  5 +++--
 pandas/tests/io/parser/test_issue_63089.py | 24 +++++++++++++---------
 reproduce_issue_63089.py                   |  7 +++++--
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 1a7bffbe12990..7a25c519ea14e 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1512,12 +1512,13 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     int n = 0;
     // Prevent integer overflow by capping exponent value
     // DBL_MAX_EXP is typically 1024, so we use a safe upper bound
-    const int MAX_EXPONENT_DIGITS = 4;  // Allows up to 9999
+    const int MAX_EXPONENT_DIGITS = 4; // Allows up to 9999
     while (isdigit_ascii(*p)) {
       if (num_digits < MAX_EXPONENT_DIGITS) {
         n = n * 10 + (*p - '0');
       }
-      // Continue consuming digits even after cap to maintain correct parsing position
+      // Continue consuming digits even after cap to maintain correct parsing
+      // position
       num_digits++;
       p++;
     }
diff --git a/pandas/tests/io/parser/test_issue_63089.py b/pandas/tests/io/parser/test_issue_63089.py
index 9eeafcf8414e4..b14f72b5b91ca 100644
--- a/pandas/tests/io/parser/test_issue_63089.py
+++ b/pandas/tests/io/parser/test_issue_63089.py
@@ -1,9 +1,10 @@
 """
 Test for issue #63089 - read_csv segfault with large exponent
 """
+
 import io
+
 import pandas as pd
-import pytest
 
 
 class TestIssue63089:
@@ -11,27 +12,30 @@ def test_large_exponent_no_segfault(self):
         """Test that extremely large exponents don't cause segfault."""
         # This previously caused SIGSEGV due to integer overflow
         # when parsing the exponent
-        result = pd.read_csv(io.StringIO("""h
-4e492493924924"""))
-        
+        result = pd.read_csv(
+            io.StringIO("""h
+4e492493924924""")
+        )
+
         # Should parse as infinity or large float, not crash
         assert len(result) == 1
-        assert 'h' in result.columns
+        assert "h" in result.columns
         # The value should be infinity since the exponent is way too large
         import numpy as np
-        assert np.isinf(result['h'].iloc[0]) or result['h'].iloc[0] > 1e308
-    
+
+        assert np.isinf(result["h"].iloc[0]) or result["h"].iloc[0] > 1e308
+
     def test_various_large_exponents(self):
         """Test various edge cases with large exponents."""
         test_cases = [
             "1e999999999",  # Very large positive exponent
-            "1e-999999999",  # Very large negative exponent  
+            "1e-999999999",  # Very large negative exponent
             "2.5e123456789",  # Large exponent with decimal
         ]
-        
+
         for test_val in test_cases:
             csv_data = f"col\n{test_val}"
             result = pd.read_csv(io.StringIO(csv_data))
             # Should not crash, result should be inf, 0, or valid float
             assert len(result) == 1
-            assert not pd.isna(result['col'].iloc[0]) or True  # Just don't crash
+            assert not pd.isna(result["col"].iloc[0]) or True  # Just don't crash
diff --git a/reproduce_issue_63089.py b/reproduce_issue_63089.py
index ed3aa60f74492..8d98926eefdef 100644
--- a/reproduce_issue_63089.py
+++ b/reproduce_issue_63089.py
@@ -1,10 +1,13 @@
 import io
+
 import pandas as pd
 
 print("Testing issue #63089...")
 try:
-    result = pd.read_csv(io.StringIO("""h
-4e492493924924"""))
+    result = pd.read_csv(
+        io.StringIO("""h
+4e492493924924""")
+    )
     print("Success! Result:")
     print(result)
 except Exception as e: