Skip to content

Commit fb517ba

Browse files
authored
ENH: resolution inference for array_to_timedelta64 (#63018)
1 parent b6d67b7 commit fb517ba

File tree

21 files changed

+179
-132
lines changed

21 files changed

+179
-132
lines changed

pandas/_libs/lib.pyx

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@ from pandas._libs.tslibs.nattype cimport (
106106
)
107107
from pandas._libs.tslibs.offsets cimport is_offset_object
108108
from pandas._libs.tslibs.period cimport is_period_object
109-
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
110109
from pandas._libs.tslibs.timezones cimport tz_compare
111110

112111
# constants that will be compared to potentially arbitrarily large
@@ -2674,11 +2673,6 @@ def maybe_convert_objects(ndarray[object] objects,
26742673
elif is_timedelta(val):
26752674
if convert_non_numeric:
26762675
seen.timedelta_ = True
2677-
try:
2678-
convert_to_timedelta64(val, "ns")
2679-
except OutOfBoundsTimedelta:
2680-
seen.object_ = True
2681-
break
26822676
break
26832677
else:
26842678
seen.object_ = True

pandas/_libs/tslibs/conversion.pxd

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,6 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1
4545

4646
cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
4747
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
48-
cdef (int64_t, int) precision_from_unit(
49-
NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
50-
)
5148

5249
cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)
5350

pandas/_libs/tslibs/timedeltas.pxd

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1
99
cpdef int64_t delta_to_nanoseconds(
1010
delta, NPY_DATETIMEUNIT reso=*, bint round_ok=*
1111
) except? -1
12-
cdef convert_to_timedelta64(object ts, str unit)
1312
cdef bint is_any_td_scalar(object obj)
1413

1514

pandas/_libs/tslibs/timedeltas.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def array_to_timedelta64(
7171
values: npt.NDArray[np.object_],
7272
unit: str | None = ...,
7373
errors: str = ...,
74+
creso: int = ...,
7475
) -> np.ndarray: ... # np.ndarray[m8[unit]]; unit is inferred from the values unless creso is given
7576
def parse_timedelta_unit(unit: str | None) -> UnitChoices: ...
7677
def delta_to_nanoseconds(

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 114 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ from pandas._libs.missing cimport checknull_with_nat_and_na
4141
from pandas._libs.tslibs.base cimport ABCTimestamp
4242
from pandas._libs.tslibs.conversion cimport (
4343
cast_from_unit,
44-
precision_from_unit,
4544
)
4645
from pandas._libs.tslibs.dtypes cimport (
4746
c_DEPR_UNITS,
@@ -290,68 +289,6 @@ cpdef int64_t delta_to_nanoseconds(
290289
) from err
291290

292291

293-
@cython.overflowcheck(True)
294-
cdef object ensure_td64ns(object ts):
295-
"""
296-
Overflow-safe implementation of td64.astype("m8[ns]")
297-
298-
Parameters
299-
----------
300-
ts : np.timedelta64
301-
302-
Returns
303-
-------
304-
np.timedelta64[ns]
305-
"""
306-
cdef:
307-
NPY_DATETIMEUNIT td64_unit
308-
int64_t td64_value, mult
309-
310-
td64_unit = get_datetime64_unit(ts)
311-
if (
312-
td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
313-
and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
314-
):
315-
316-
td64_value = cnp.get_timedelta64_value(ts)
317-
318-
mult = precision_from_unit(td64_unit)[0]
319-
try:
320-
# NB: cython#1381 this cannot be *=
321-
td64_value = td64_value * mult
322-
except OverflowError as err:
323-
raise OutOfBoundsTimedelta(ts) from err
324-
325-
return np.timedelta64(td64_value, "ns")
326-
327-
return ts
328-
329-
330-
cdef convert_to_timedelta64(object ts, str unit):
331-
"""
332-
Convert an incoming object to a timedelta64 if possible.
333-
Before calling, unit must be standardized to avoid repeated unit conversion
334-
335-
Handle these types of objects:
336-
- timedelta/Timedelta
337-
338-
Return a timedelta64[ns] object
339-
"""
340-
# Caller is responsible for checking unit not in ["Y", "y", "M"]
341-
if isinstance(ts, _Timedelta):
342-
# already in the proper format
343-
if ts._creso != NPY_FR_ns:
344-
ts = ts.as_unit("ns").asm8
345-
else:
346-
ts = np.timedelta64(ts._value, "ns")
347-
348-
elif PyDelta_Check(ts):
349-
ts = np.timedelta64(delta_to_nanoseconds(ts), "ns")
350-
elif not cnp.is_timedelta64_object(ts):
351-
raise TypeError(f"Invalid type for timedelta scalar: {type(ts)}")
352-
return ts.astype("timedelta64[ns]")
353-
354-
355292
cdef _numeric_to_td64ns(object item, str unit):
356293
# caller is responsible for checking
357294
# assert unit not in ["Y", "y", "M"]
@@ -370,10 +307,34 @@ cdef _numeric_to_td64ns(object item, str unit):
370307
return ts
371308

372309

310+
# TODO: de-duplicate with DatetimeParseState
311+
cdef class ResoState:
312+
cdef:
313+
NPY_DATETIMEUNIT creso
314+
bint creso_ever_changed
315+
316+
def __cinit__(self, NPY_DATETIMEUNIT creso):
317+
self.creso = creso
318+
self.creso_ever_changed = False
319+
320+
cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept:
321+
# Return a bool indicating whether we bumped to a higher resolution
322+
if self.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
323+
self.creso = item_reso
324+
elif item_reso > self.creso:
325+
self.creso = item_reso
326+
self.creso_ever_changed = True
327+
return True
328+
return False
329+
330+
373331
@cython.boundscheck(False)
374332
@cython.wraparound(False)
375333
def array_to_timedelta64(
376-
ndarray values, str unit=None, str errors="raise"
334+
ndarray values,
335+
str unit=None,
336+
str errors="raise",
337+
NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
377338
) -> ndarray:
378339
# values is object-dtype, may be 2D
379340
"""
@@ -395,6 +356,10 @@ def array_to_timedelta64(
395356
cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
396357
cnp.flatiter it
397358
str parsed_unit = parse_timedelta_unit(unit or "ns")
359+
NPY_DATETIMEUNIT item_reso
360+
ResoState state = ResoState(creso)
361+
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
362+
ndarray iresult = result.view("i8")
398363

399364
if values.descr.type_num != cnp.NPY_OBJECT:
400365
# raise here otherwise we segfault below
@@ -422,18 +387,58 @@ def array_to_timedelta64(
422387
ival = NPY_NAT
423388

424389
elif cnp.is_timedelta64_object(item):
425-
td64ns_obj = ensure_td64ns(item)
426-
ival = cnp.get_timedelta64_value(td64ns_obj)
390+
# TODO: de-duplicate this with Timedelta.__new__
391+
ival = cnp.get_timedelta64_value(item)
392+
dt64_reso = get_datetime64_unit(item)
393+
if not (
394+
is_supported_unit(dt64_reso) or
395+
dt64_reso in [
396+
NPY_DATETIMEUNIT.NPY_FR_m,
397+
NPY_DATETIMEUNIT.NPY_FR_h,
398+
NPY_DATETIMEUNIT.NPY_FR_D,
399+
NPY_DATETIMEUNIT.NPY_FR_W,
400+
NPY_DATETIMEUNIT.NPY_FR_GENERIC
401+
]
402+
):
403+
err = npy_unit_to_abbrev(dt64_reso)
404+
raise ValueError(
405+
f"Unit {err} is not supported. "
406+
"Only unambiguous timedelta values durations are supported. "
407+
"Allowed units are 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns'")
408+
409+
item_reso = get_supported_reso(dt64_reso)
410+
state.update_creso(item_reso)
411+
if infer_reso:
412+
creso = state.creso
413+
if dt64_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
414+
try:
415+
ival = convert_reso(
416+
ival,
417+
dt64_reso,
418+
creso,
419+
round_ok=True,
420+
)
421+
except (OverflowError, OutOfBoundsDatetime) as err:
422+
raise OutOfBoundsTimedelta(item) from err
423+
else:
424+
# e.g. NaT
425+
pass
427426

428427
elif isinstance(item, _Timedelta):
429-
if item._creso != NPY_FR_ns:
430-
ival = item.as_unit("ns")._value
431-
else:
432-
ival = item._value
428+
item_reso = item._creso
429+
state.update_creso(item_reso)
430+
if infer_reso:
431+
creso = state.creso
432+
433+
ival = (<_Timedelta>item)._as_creso(creso)._value
433434

434435
elif PyDelta_Check(item):
435436
# i.e. isinstance(item, timedelta)
436-
ival = delta_to_nanoseconds(item)
437+
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
438+
state.update_creso(item_reso)
439+
if infer_reso:
440+
creso = state.creso
441+
ival = delta_to_nanoseconds(item, reso=creso)
437442

438443
elif isinstance(item, str):
439444
if (
@@ -444,13 +449,27 @@ def array_to_timedelta64(
444449
else:
445450
ival = parse_timedelta_string(item)
446451

452+
item_reso = NPY_FR_ns
453+
state.update_creso(item_reso)
454+
if infer_reso:
455+
creso = state.creso
456+
447457
elif is_tick_object(item):
448-
ival = item.nanos
458+
item_reso = get_supported_reso(item._creso)
459+
state.update_creso(item_reso)
460+
if infer_reso:
461+
creso = state.creso
462+
ival = delta_to_nanoseconds(item, reso=creso)
449463

450464
elif is_integer_object(item) or is_float_object(item):
451465
td64ns_obj = _numeric_to_td64ns(item, parsed_unit)
452466
ival = cnp.get_timedelta64_value(td64ns_obj)
453467

468+
item_reso = NPY_FR_ns
469+
state.update_creso(item_reso)
470+
if infer_reso:
471+
creso = state.creso
472+
454473
else:
455474
raise TypeError(f"Invalid type for timedelta scalar: {type(item)}")
456475

@@ -468,6 +487,29 @@ def array_to_timedelta64(
468487

469488
cnp.PyArray_MultiIter_NEXT(mi)
470489

490+
if infer_reso:
491+
if state.creso_ever_changed:
492+
# We encountered mismatched resolutions, need to re-parse with
493+
# the correct one.
494+
return array_to_timedelta64(
495+
values,
496+
unit=unit,
497+
errors=errors,
498+
creso=state.creso,
499+
)
500+
elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
501+
# i.e. we never encountered anything non-NaT, default to "s". This
502+
# ensures that insert and concat-like operations with NaT
503+
# do not upcast units
504+
result = iresult.view("m8[s]")
505+
else:
506+
# Otherwise we can use the single reso that we encountered and avoid
507+
# a second pass.
508+
abbrev = npy_unit_to_abbrev(state.creso)
509+
result = iresult.view(f"m8[{abbrev}]")
510+
else:
511+
abbrev = npy_unit_to_abbrev(creso)
512+
result = result.view(f"m8[{abbrev}]")
471513
return result
472514

473515

pandas/core/arrays/timedeltas.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1251,7 +1251,7 @@ def _objects_to_td64ns(
12511251
values = np.asarray(data, dtype=np.object_)
12521252

12531253
result = array_to_timedelta64(values, unit=unit, errors=errors)
1254-
return result.view("timedelta64[ns]")
1254+
return result
12551255

12561256

12571257
def _validate_td64_dtype(dtype) -> DtypeObj:

pandas/tests/arithmetic/test_timedelta64.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,7 @@ def test_tdi_add_overflow(self):
736736
)
737737

738738
# These should not overflow!
739-
exp = TimedeltaIndex([NaT])
739+
exp = TimedeltaIndex([NaT], dtype="m8[ns]")
740740
result = pd.to_timedelta([NaT]) - Timedelta("1 days")
741741
tm.assert_index_equal(result, exp)
742742

@@ -2235,7 +2235,7 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names):
22352235

22362236
def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array):
22372237
# GH#39750 make sure we infer the result as td64
2238-
tdi = TimedeltaIndex([NaT, NaT])
2238+
tdi = TimedeltaIndex([NaT, NaT], dtype="m8[ns]")
22392239

22402240
left = tm.box_expected(tdi, box_with_array)
22412241
right = np.array([2, 2.0], dtype=object)

pandas/tests/dtypes/test_inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -808,7 +808,7 @@ def test_maybe_convert_objects_datetime(self):
808808
tm.assert_numpy_array_equal(out, exp)
809809

810810
arr = np.array([pd.NaT, np.timedelta64(1, "s")], dtype=object)
811-
exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[ns]")
811+
exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[s]")
812812
out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
813813
tm.assert_numpy_array_equal(out, exp)
814814

@@ -863,7 +863,7 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype):
863863
if dtype == "datetime64[ns]":
864864
expected = np.array(["2363-10-04"], dtype="M8[us]")
865865
else:
866-
expected = arr
866+
expected = arr.astype("m8[us]")
867867
tm.assert_numpy_array_equal(out, expected)
868868

869869
def test_maybe_convert_objects_mixed_datetimes(self):

pandas/tests/extension/test_arrow.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -591,14 +591,6 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
591591
if data.dtype._is_numeric:
592592
mark = pytest.mark.xfail(reason="skew not implemented")
593593
request.applymarker(mark)
594-
elif (
595-
op_name in ["std", "sem"]
596-
and pa.types.is_date64(data._pa_array.type)
597-
and skipna
598-
):
599-
# overflow
600-
mark = pytest.mark.xfail(reason="Cannot cast")
601-
request.applymarker(mark)
602594
return super().test_reduce_frame(data, all_numeric_reductions, skipna)
603595

604596
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])

pandas/tests/frame/methods/test_dtypes.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_dtypes_timedeltas(self):
103103
)
104104
result = df.dtypes
105105
expected = Series(
106-
[np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
106+
[np.dtype("datetime64[ns]"), np.dtype("timedelta64[us]")], index=list("AB")
107107
)
108108
tm.assert_series_equal(result, expected)
109109

@@ -112,7 +112,7 @@ def test_dtypes_timedeltas(self):
112112
expected = Series(
113113
[
114114
np.dtype("datetime64[ns]"),
115-
np.dtype("timedelta64[ns]"),
115+
np.dtype("timedelta64[us]"),
116116
np.dtype("datetime64[ns]"),
117117
],
118118
index=list("ABC"),
@@ -125,7 +125,7 @@ def test_dtypes_timedeltas(self):
125125
expected = Series(
126126
[
127127
np.dtype("datetime64[ns]"),
128-
np.dtype("timedelta64[ns]"),
128+
np.dtype("timedelta64[us]"),
129129
np.dtype("datetime64[ns]"),
130130
np.dtype("int64"),
131131
],

0 commit comments

Comments
 (0)