Skip to content

Commit 945385d

Browse files
API: microsecond resolution for Timedelta strings (#63196)
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
1 parent cbe9c99 commit 945385d

File tree

68 files changed

+340
-218
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+340
-218
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,8 @@ In cases with mixed-resolution inputs, the highest resolution is used:
384384
385385
.. warning:: Many users will now get "M8[us]" dtype data in cases when they used to get "M8[ns]". For most use cases they should not notice a difference. One big exception is converting to integers, which will give integers 1000x smaller.
386386

387+
Similarly, the :class:`Timedelta` constructor and :func:`to_timedelta` with a string input now default to microsecond unit, using nanosecond unit only in cases that actually have nanosecond precision.
388+
387389
.. _whatsnew_300.api_breaking.concat_datetime_sorting:
388390

389391
:func:`concat` no longer ignores ``sort`` when all objects have a :class:`DatetimeIndex`

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import collections
2+
import re
23
import warnings
34

45
from pandas.util._decorators import set_module
@@ -448,11 +449,19 @@ def array_to_timedelta64(
448449
ival = parse_iso_format_string(item)
449450
else:
450451
ival = parse_timedelta_string(item)
452+
if (
453+
(infer_reso or creso == NPY_DATETIMEUNIT.NPY_FR_us)
454+
and not needs_nano_unit(ival, item)
455+
):
456+
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
457+
ival = ival // 1000
458+
else:
459+
item_reso = NPY_FR_ns
451460

452-
item_reso = NPY_FR_ns
453-
state.update_creso(item_reso)
454-
if infer_reso:
455-
creso = state.creso
461+
if ival != NPY_NAT:
462+
state.update_creso(item_reso)
463+
if infer_reso:
464+
creso = state.creso
456465

457466
elif is_tick_object(item):
458467
item_reso = get_supported_reso(item._creso)
@@ -722,6 +731,24 @@ cdef timedelta_from_spec(object number, object frac, object unit):
722731
return cast_from_unit(float(n), unit)
723732

724733

734+
cdef bint needs_nano_unit(int64_t ival, str item):
735+
"""
736+
Check if a passed string `item` needs to be stored with nano unit or can
737+
use microsecond instead. Needs nanoseconds if:
738+
739+
- if the parsed value in nanoseconds has sub-microseconds content -> certainly
740+
needs nano
741+
- if the seconds part in the string contains more than 6 decimals, i.e. has
742+
trailing zeros beyond the microsecond part (e.g. "0.123456000 s") -> treat
743+
as nano for consistency
744+
- if the string explicitly contains an entry for nanoseconds (e.g. "1000 ns")
745+
"""
746+
# TODO: more performant way of doing this check?
747+
if ival % 1000 != 0:
748+
return True
749+
return re.search(r"\.\d{7}", item) or "ns" in item or "nano" in item.lower()
750+
751+
725752
cpdef inline str parse_timedelta_unit(str unit):
726753
"""
727754
Parameters
@@ -2121,10 +2148,17 @@ class Timedelta(_Timedelta):
21212148
if (len(value) > 0 and value[0] == "P") or (
21222149
len(value) > 1 and value[:2] == "-P"
21232150
):
2124-
value = parse_iso_format_string(value)
2151+
ival = parse_iso_format_string(value)
2152+
else:
2153+
ival = parse_timedelta_string(value)
2154+
2155+
if not needs_nano_unit(ival, value):
2156+
# If we don't specifically need nanosecond resolution, default
2157+
# to microsecond like we do for datetimes
2158+
value = np.timedelta64(ival // 1000, "us")
2159+
return cls(value)
21252160
else:
2126-
value = parse_timedelta_string(value)
2127-
value = np.timedelta64(value)
2161+
value = np.timedelta64(ival, "ns")
21282162
elif PyDelta_Check(value):
21292163
# pytimedelta object -> microsecond resolution
21302164
new_value = delta_to_nanoseconds(

pandas/core/arrays/datetimelike.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ def inferred_freq(self) -> str | None:
927927
>>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"])
928928
>>> tdelta_idx
929929
TimedeltaIndex(['0 days', '10 days', '20 days'],
930-
dtype='timedelta64[ns]', freq=None)
930+
dtype='timedelta64[us]', freq=None)
931931
>>> tdelta_idx.inferred_freq
932932
'10D'
933933
"""

pandas/core/arrays/timedeltas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ class TimedeltaArray(dtl.TimelikeOps):
150150
>>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(["1h", "2h"]))
151151
<TimedeltaArray>
152152
['0 days 01:00:00', '0 days 02:00:00']
153-
Length: 2, dtype: timedelta64[ns]
153+
Length: 2, dtype: timedelta64[us]
154154
"""
155155

156156
_typ = "timedeltaarray"
@@ -813,7 +813,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]:
813813
>>> idx = pd.to_timedelta(np.arange(5), unit="D")
814814
>>> idx
815815
TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
816-
dtype='timedelta64[ns]', freq=None)
816+
dtype='timedelta64[us]', freq=None)
817817
818818
>>> idx.total_seconds()
819819
Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')
@@ -892,7 +892,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
892892
>>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"])
893893
>>> tdelta_idx
894894
TimedeltaIndex(['0 days', '10 days', '20 days'],
895-
dtype='timedelta64[ns]', freq=None)
895+
dtype='timedelta64[us]', freq=None)
896896
>>> tdelta_idx.days
897897
Index([0, 10, 20], dtype='int64')"""
898898
)

pandas/core/dtypes/astype.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def _astype_nansafe(
117117
# bc we know arr.dtype == object, this is equivalent to
118118
# `np.asarray(to_timedelta(arr))`, but using a lower-level API that
119119
# does not require a circular import.
120-
tdvals = array_to_timedelta64(arr).view("m8[ns]")
120+
tdvals = array_to_timedelta64(arr)
121121

122122
tda = ensure_wrapped_if_datetimelike(tdvals)
123123
return tda.astype(dtype, copy=False)._ndarray

pandas/core/frame.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8657,7 +8657,8 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
86578657
rvalues = series._values
86588658
if not isinstance(rvalues, np.ndarray):
86598659
# TODO(EA2D): no need to special-case with 2D EAs
8660-
if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
8660+
if lib.is_np_dtype(rvalues.dtype, "mM"):
8661+
# i.e. DatetimeArray[tznaive] or TimedeltaArray
86618662
# We can losslessly+cheaply cast to ndarray
86628663
rvalues = np.asarray(rvalues)
86638664
else:

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1573,7 +1573,7 @@ def abs(self) -> Self:
15731573
>>> s = pd.Series([pd.Timedelta("1 days")])
15741574
>>> s.abs()
15751575
0 1 days
1576-
dtype: timedelta64[ns]
1576+
dtype: timedelta64[us]
15771577
15781578
Select rows with data closest to certain value using argsort (from
15791579
`StackOverflow <https://stackoverflow.com/a/17758115>`__).

pandas/core/indexes/accessors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,7 @@ class TimedeltaProperties(Properties):
453453
0 0 days 00:00:01
454454
1 0 days 00:00:02
455455
2 0 days 00:00:03
456-
dtype: timedelta64[ns]
456+
dtype: timedelta64[us]
457457
>>> seconds_series.dt.seconds
458458
0 1
459459
1 2

pandas/core/indexes/datetimelike.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ def inferred_freq(self) -> str | None:
691691
>>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"])
692692
>>> tdelta_idx
693693
TimedeltaIndex(['0 days', '10 days', '20 days'],
694-
dtype='timedelta64[ns]', freq=None)
694+
dtype='timedelta64[us]', freq=None)
695695
>>> tdelta_idx.inferred_freq
696696
'10D'
697697
"""

pandas/core/indexes/timedeltas.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ class TimedeltaIndex(DatetimeTimedeltaMixin):
122122
--------
123123
>>> pd.TimedeltaIndex(["0 days", "1 days", "2 days", "3 days", "4 days"])
124124
TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
125-
dtype='timedelta64[ns]', freq=None)
125+
dtype='timedelta64[us]', freq=None)
126126
127127
We can also let pandas infer the frequency when possible.
128128
@@ -230,18 +230,27 @@ def get_loc(self, key):
230230

231231
return Index.get_loc(self, key)
232232

233-
# error: Return type "tuple[Timedelta | NaTType, None]" of "_parse_with_reso"
234-
# incompatible with return type "tuple[datetime, Resolution]" in supertype
235-
# "DatetimeIndexOpsMixin"
236-
def _parse_with_reso(self, label: str) -> tuple[Timedelta | NaTType, None]: # type: ignore[override]
237-
# the "with_reso" is a no-op for TimedeltaIndex
233+
# error: Return type "tuple[Timedelta | NaTType, Resolution]" of
234+
# "_parse_with_reso" incompatible with return type
235+
# "tuple[datetime, Resolution]" in supertype
236+
# "pandas.core.indexes.datetimelike.DatetimeIndexOpsMixin"
237+
def _parse_with_reso(self, label: str) -> tuple[Timedelta | NaTType, Resolution]: # type: ignore[override]
238238
parsed = Timedelta(label)
239-
return parsed, None
239+
if isinstance(parsed, Timedelta):
240+
reso = Resolution.get_reso_from_freqstr(parsed.unit)
241+
else:
242+
# i.e. pd.NaT
243+
reso = Resolution.get_reso_from_freqstr("s")
244+
return parsed, reso
240245

241-
def _parsed_string_to_bounds(self, reso, parsed: Timedelta):
246+
def _parsed_string_to_bounds(self, reso: Resolution, parsed: Timedelta):
242247
# reso is unused, included to match signature of DTI/PI
243248
lbound = parsed.round(parsed.resolution_string)
244-
rbound = lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
249+
rbound = (
250+
lbound
251+
+ to_offset(parsed.resolution_string)
252+
- Timedelta(1, unit=self.unit).as_unit(self.unit)
253+
)
245254
return lbound, rbound
246255

247256
# -------------------------------------------------------------------
@@ -314,14 +323,14 @@ def timedelta_range(
314323
--------
315324
>>> pd.timedelta_range(start="1 day", periods=4)
316325
TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'],
317-
dtype='timedelta64[ns]', freq='D')
326+
dtype='timedelta64[us]', freq='D')
318327
319328
The ``closed`` parameter specifies which endpoint is included. The default
320329
behavior is to include both endpoints.
321330
322331
>>> pd.timedelta_range(start="1 day", periods=4, closed="right")
323332
TimedeltaIndex(['2 days', '3 days', '4 days'],
324-
dtype='timedelta64[ns]', freq='D')
333+
dtype='timedelta64[us]', freq='D')
325334
326335
The ``freq`` parameter specifies the frequency of the TimedeltaIndex.
327336
Only fixed frequencies can be passed, non-fixed frequencies such as
@@ -330,15 +339,15 @@ def timedelta_range(
330339
>>> pd.timedelta_range(start="1 day", end="2 days", freq="6h")
331340
TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',
332341
'1 days 18:00:00', '2 days 00:00:00'],
333-
dtype='timedelta64[ns]', freq='6h')
342+
dtype='timedelta64[us]', freq='6h')
334343
335344
Specify ``start``, ``end``, and ``periods``; the frequency is generated
336345
automatically (linearly spaced).
337346
338347
>>> pd.timedelta_range(start="1 day", end="5 days", periods=4)
339348
TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',
340349
'5 days 00:00:00'],
341-
dtype='timedelta64[ns]', freq=None)
350+
dtype='timedelta64[us]', freq=None)
342351
343352
**Specify a unit**
344353

0 commit comments

Comments
 (0)