Skip to content

Commit 5b024e8

Browse files
[-] Try to make the datetime guesser better
1 parent bdc9a7b commit 5b024e8

File tree

3 files changed

+61
-42
lines changed

3 files changed

+61
-42
lines changed

pandas/_libs/tslibs/parsing.pyx

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,7 +1049,6 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
10491049
# rebuild string, capturing any inferred padding
10501050
dt_str = "".join(tokens)
10511051
if parsed_datetime.strftime(guessed_format) == dt_str:
1052-
_maybe_warn_about_dayfirst(guessed_format, dayfirst)
10531052
return guessed_format
10541053
else:
10551054
return None
@@ -1072,29 +1071,6 @@ cdef str _fill_token(token: str, padding: int):
10721071
return token_filled
10731072

10741073

1075-
cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept:
1076-
"""Warn if guessed datetime format doesn't respect dayfirst argument."""
1077-
cdef:
1078-
int day_index = format.find("%d")
1079-
int month_index = format.find("%m")
1080-
1081-
if (day_index != -1) and (month_index != -1):
1082-
if (day_index > month_index) and dayfirst:
1083-
warnings.warn(
1084-
f"Parsing dates in {format} format when dayfirst=True was specified. "
1085-
"Pass `dayfirst=False` or specify a format to silence this warning.",
1086-
UserWarning,
1087-
stacklevel=find_stack_level(),
1088-
)
1089-
if (day_index < month_index) and not dayfirst:
1090-
warnings.warn(
1091-
f"Parsing dates in {format} format when dayfirst=False (the default) "
1092-
"was specified. "
1093-
"Pass `dayfirst=True` or specify a format to silence this warning.",
1094-
UserWarning,
1095-
stacklevel=find_stack_level(),
1096-
)
1097-
10981074

10991075
cpdef str get_rule_month(str source):
11001076
"""

pandas/core/tools/datetimes.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from collections import abc
4-
from datetime import date
4+
from datetime import date, datetime
55
from functools import partial
66
from itertools import islice
77
from typing import (
@@ -131,24 +131,26 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
131131

132132
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
133133
# Guess a format from each non-null string element, then return a guessed format that parses every truthy element; return None if no guessed format does
134-
if (first_non_null := tslib.first_non_null(arr)) != -1:
135-
if type(first_non_nan_element := arr[first_non_null]) is str:
136-
# GH#32264 np.str_ object
137-
guessed_format = guess_datetime_format(
138-
first_non_nan_element, dayfirst=dayfirst
139-
)
134+
search_start = 0
135+
allowed_formats = set()
136+
while not search_start >= len(arr):
137+
non_null_offset = tslib.first_non_null(arr[search_start:])
138+
if non_null_offset == -1:
139+
break
140+
idx = search_start + non_null_offset
141+
element = arr[idx]
142+
if isinstance(element, str):
143+
guessed_format = guess_datetime_format(str(element), dayfirst=dayfirst)
140144
if guessed_format is not None:
141-
return guessed_format
142-
# If there are multiple non-null elements, warn about
143-
# how parsing might not be consistent
144-
if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
145-
warnings.warn(
146-
"Could not infer format, so each element will be parsed "
147-
"individually, falling back to `dateutil`. To ensure parsing is "
148-
"consistent and as-expected, please specify a format.",
149-
UserWarning,
150-
stacklevel=find_stack_level(),
151-
)
145+
allowed_formats.add(guessed_format)
146+
search_start = idx + 1
147+
# Return a guessed format that parses every truthy item in the array (None and empty strings are skipped), if one exists
148+
for fmt in list(allowed_formats):
149+
try:
150+
[datetime.strptime(date_string, fmt) for date_string in arr if date_string]
151+
return fmt
152+
except ValueError:
153+
pass
152154
return None
153155

154156

pandas/tests/tslibs/test_parsing.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,3 +424,44 @@ def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input):
424424
)
425425
assert except_out_dateutil == except_in_dateutil
426426
assert result[0] == expected
427+
428+
429+
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
430+
431+
432+
@pytest.mark.parametrize(
433+
"expected_format, array",
434+
[
435+
("%d/%m/%Y", np.array(["01/02/2025", "30/07/2025"])),
436+
("%Y-%m-%d", np.array(["2025-08-09", "2025-08-13", None])),
437+
("%m/%d/%Y", np.array(["02/01/2025", "12/31/2025"])),
438+
("%d-%m-%Y", np.array(["01-02-2025", "30-07-2025"])),
439+
("%d.%m.%Y", np.array(["01.02.2025", "30.07.2025"])),
440+
("%Y/%m/%d", np.array(["2025/08/09", "2025/12/01"])),
441+
("%b %d, %Y", np.array(["Feb 01, 2025", "Jul 30, 2025"])),
442+
("%B %d, %Y", np.array(["February 01, 2025", "July 30, 2025"])),
443+
("%d %b %Y", np.array(["01 Feb 2025", "30 Jul 2025"])),
444+
("%d-%b-%Y", np.array(["01-Feb-2025", "30-Jul-2025"])),
445+
("%Y%m%d", np.array(["20250201", "20250730"])),
446+
(None, np.array(["02/01/25", "12/31/25"])),
447+
("%Y-%m-%d %H:%M:%S", np.array(["2025-08-09 14:30:00", "2025-12-01 00:00:00"])),
448+
("%Y-%m-%dT%H:%M:%S", np.array(["2025-08-09T14:30:00", "2025-12-01T00:00:00"])),
449+
(
450+
"%Y-%m-%dT%H:%M:%S.%f",
451+
np.array(["2025-08-09T14:30:00.123456", "2025-12-01T00:00:00.5"]),
452+
),
453+
(
454+
"%Y-%m-%d %H:%M:%S%z",
455+
np.array(["2025-08-09 14:30:00+0000", "2025-12-01 09:15:00-0500"]),
456+
),
457+
("%Y-%m-%d", np.array(["2025-08-09", None, "2025-12-01"])),
458+
(None, np.array(["2025/13/01", "not-a-date", ""])),
459+
(
460+
None,
461+
np.array(["01/02/2025", "2025-02-01"]),
462+
),
463+
],
464+
)
465+
def test_guess_datetime_format_for_array(expected_format: str, array: np.array) -> None:
466+
fmt = _guess_datetime_format_for_array(array, dayfirst=False)
467+
assert fmt == expected_format, f"{fmt} does not match {expected_format}"

0 commit comments

Comments
 (0)