Skip to content

Commit 9de4b2a

Browse files
committed
refactor: Consolidate _str_copy_decimal_str_c and copy_string_without_char
1 parent 05504cf commit 9de4b2a

File tree

1 file changed

+41
-116
lines changed

1 file changed

+41
-116
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 41 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -1800,22 +1800,28 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src,
18001800
const char *s = src;
18011801
char *d = dst;
18021802
const char *de = dst + dst_sz;
1803+
bool seen_digit = false;
18031804
int ret;
18041805

18051806
if (endpos != NULL)
18061807
*endpos = (char *)s;
18071808

1808-
// Skip leading whitespace.
1809+
// Skip leading whitespace
18091810
SKIP_SPAN(s, whitespaces);
18101811

18111812
// Copy leading sign (optional)
18121813
SAFE_CONSUME_NSPAN(d, de, s, 1, signs);
18131814

1815+
// Check that there is a first digit or decimal point.
1816+
if (!isdigit_ascii(*s) && *s != decimal)
1817+
return ERROR_NO_DIGITS;
1818+
18141819
// Copy integer part dropping `tsep`
18151820
while ((ret = str_consume_span(&d, de - d, &s, digits))) {
18161821
if (ret < 0)
18171822
return ERROR_OVERFLOW;
1818-
SKIP_NSPAN(s, 1, tseps);
1823+
seen_digit = true;
1824+
SKIP_SPAN(s, tseps);
18191825
}
18201826

18211827
// Replace `decimal` with '.'
@@ -1825,7 +1831,16 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src,
18251831
}
18261832

18271833
// Copy fractional part after decimal (if any)
1828-
SAFE_CONSUME_SPAN(d, de, s, digits);
1834+
if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) {
1835+
seen_digit = true;
1836+
} else if (ret < 0) {
1837+
return ERROR_OVERFLOW;
1838+
}
1839+
1840+
if (!seen_digit) {
1841+
// No digits found in integer or fractional part
1842+
return ERROR_NO_DIGITS;
1843+
}
18291844

18301845
// Copy exponent if any
18311846
if ((ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) {
@@ -1835,10 +1850,17 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src,
18351850
return ERROR_OVERFLOW;
18361851
}
18371852

1853+
// Skip trailing whitespace
1854+
SKIP_SPAN(s, whitespaces);
1855+
18381856
// Terminate string
18391857
CHECK_BUFFER_SPACE(d, de);
18401858
*d++ = '\0';
18411859

1860+
// Did we use up all the characters?
1861+
if (*s)
1862+
return ERROR_INVALID_CHARS;
1863+
18421864
if (endpos != NULL)
18431865
*endpos = (char *)s;
18441866

@@ -1903,150 +1925,53 @@ int uint64_conflict(uint_state *self) {
19031925
return self->seen_uint && (self->seen_sint || self->seen_null);
19041926
}
19051927

1906-
/* Copy a string without `char_to_remove` into `output`.
1907-
*/
1908-
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
1909-
const char *str, size_t str_len,
1910-
char char_to_remove) {
1911-
const char *left = str;
1912-
const char *end_ptr = str + str_len;
1913-
size_t bytes_written = 0;
1914-
1915-
while (left < end_ptr) {
1916-
const size_t remaining_bytes_to_read = end_ptr - left;
1917-
const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
1918-
1919-
if (!right) {
1920-
// If it doesn't find the char to remove, just copy until EOS.
1921-
right = end_ptr;
1922-
}
1923-
1924-
const size_t chunk_size = right - left;
1925-
1926-
if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
1927-
return -1;
1928-
}
1929-
memcpy(&output[bytes_written], left, chunk_size);
1930-
bytes_written += chunk_size;
1931-
1932-
left = right + 1;
1933-
}
1934-
1935-
output[bytes_written] = '\0';
1936-
return 0;
1937-
}
1938-
19391928
int64_t str_to_int64(const char *p_item, int *error, char tsep) {
1940-
const char *p = p_item;
1941-
// Skip leading spaces.
1942-
while (isspace_ascii(*p)) {
1943-
++p;
1944-
}
1945-
1946-
// Handle sign.
1947-
const bool has_sign = *p == '-' || *p == '+';
1948-
// Handle sign.
1949-
const char *digit_start = has_sign ? p + 1 : p;
1950-
1951-
// Check that there is a first digit.
1952-
if (!isdigit_ascii(*digit_start)) {
1953-
// Error...
1954-
*error = ERROR_NO_DIGITS;
1955-
return 0;
1956-
}
1957-
19581929
char buffer[PROCESSED_WORD_CAPACITY];
1959-
const size_t str_len = strlen(p);
1960-
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1961-
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1962-
if (status != 0) {
1963-
// Word is too big, probably will cause an overflow
1964-
*error = ERROR_OVERFLOW;
1965-
return 0;
1966-
}
1967-
p = buffer;
1930+
char *endptr;
1931+
int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
1932+
&endptr, '\0', tsep);
1933+
if (status != 0) {
1934+
*error = status;
1935+
return 0;
19681936
}
19691937

1970-
char *endptr;
1971-
int64_t number = strtoll(p, &endptr, 10);
1938+
int64_t number = strtoll(buffer, &endptr, 10);
19721939

19731940
if (errno == ERANGE) {
19741941
*error = ERROR_OVERFLOW;
19751942
errno = 0;
19761943
return 0;
19771944
}
19781945

1979-
// Skip trailing spaces.
1980-
while (isspace_ascii(*endptr)) {
1981-
++endptr;
1982-
}
1983-
1984-
// Did we use up all the characters?
1985-
if (*endptr) {
1986-
*error = ERROR_INVALID_CHARS;
1987-
return 0;
1988-
}
1989-
19901946
*error = 0;
19911947
return number;
19921948
}
19931949

19941950
uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
19951951
char tsep) {
1996-
const char *p = p_item;
1997-
// Skip leading spaces.
1998-
while (isspace_ascii(*p)) {
1999-
++p;
1952+
char buffer[PROCESSED_WORD_CAPACITY];
1953+
char *endptr;
1954+
int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
1955+
&endptr, '\0', tsep);
1956+
if (status != 0) {
1957+
*error = status;
1958+
return 0;
20001959
}
20011960

2002-
// Handle sign.
2003-
if (*p == '-') {
1961+
if (buffer[0] == '-') {
20041962
state->seen_sint = 1;
20051963
*error = 0;
20061964
return 0;
2007-
} else if (*p == '+') {
2008-
p++;
20091965
}
20101966

2011-
// Check that there is a first digit.
2012-
if (!isdigit_ascii(*p)) {
2013-
// Error...
2014-
*error = ERROR_NO_DIGITS;
2015-
return 0;
2016-
}
2017-
2018-
char buffer[PROCESSED_WORD_CAPACITY];
2019-
const size_t str_len = strlen(p);
2020-
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
2021-
const int status = copy_string_without_char(buffer, p, str_len, tsep);
2022-
if (status != 0) {
2023-
// Word is too big, probably will cause an overflow
2024-
*error = ERROR_OVERFLOW;
2025-
return 0;
2026-
}
2027-
p = buffer;
2028-
}
2029-
2030-
char *endptr;
2031-
uint64_t number = strtoull(p, &endptr, 10);
1967+
uint64_t number = strtoull(buffer, &endptr, 10);
20321968

20331969
if (errno == ERANGE) {
20341970
*error = ERROR_OVERFLOW;
20351971
errno = 0;
20361972
return 0;
20371973
}
20381974

2039-
// Skip trailing spaces.
2040-
while (isspace_ascii(*endptr)) {
2041-
++endptr;
2042-
}
2043-
2044-
// Did we use up all the characters?
2045-
if (*endptr) {
2046-
*error = ERROR_INVALID_CHARS;
2047-
return 0;
2048-
}
2049-
20501975
if (number > (uint64_t)INT64_MAX) {
20511976
state->seen_uint = 1;
20521977
}

0 commit comments

Comments (0)