Skip to content

Commit 1893382

Browse files
committed
Merge remote-tracking branch 'upstream' into read-csv-from-directory
2 parents da1c1ed + 3e1d6d5 commit 1893382

File tree

18 files changed

+182
-4759
lines changed

18 files changed

+182
-4759
lines changed

doc/source/user_guide/migration-3-strings.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,37 @@ the :meth:`~pandas.Series.str.decode` method now has a ``dtype`` parameter to be
315315
able to specify object dtype instead of the default of string dtype for this use
316316
case.
317317

318+
:meth:`Series.values` now returns an :class:`~pandas.api.extensions.ExtensionArray`
319+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
320+
321+
With object dtype, using ``.values`` on a Series will return the underlying NumPy array.
322+
323+
.. code-block:: python
324+
325+
>>> ser = pd.Series(["a", "b", np.nan], dtype="object")
326+
>>> type(ser.values)
327+
<class 'numpy.ndarray'>
328+
329+
However with the new string dtype, the underlying ExtensionArray is returned instead.
330+
331+
.. code-block:: python
332+
333+
>>> ser = pd.Series(["a", "b", pd.NA], dtype="str")
334+
>>> ser.values
335+
<ArrowStringArray>
336+
['a', 'b', nan]
337+
Length: 3, dtype: str
338+
339+
If your code requires a NumPy array, you should use :meth:`Series.to_numpy`.
340+
341+
.. code-block:: python
342+
343+
>>> ser = pd.Series(["a", "b", pd.NA], dtype="str")
344+
>>> ser.to_numpy()
345+
['a' 'b' nan]
346+
347+
In general, you should always prefer :meth:`Series.to_numpy` to get a NumPy array or :meth:`Series.array` to get an ExtensionArray over using :meth:`Series.values`.
348+
318349
Notable bug fixes
319350
~~~~~~~~~~~~~~~~~
320351

pandas/_libs/src/datetime/pd_datetime.c

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,23 @@ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
5555
out->month = 1;
5656
out->day = 1;
5757

58-
out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year"));
59-
out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month"));
60-
out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day"));
58+
tmp = PyObject_GetAttrString(obj, "year");
59+
if (tmp == NULL)
60+
return -1;
61+
out->year = PyLong_AsLong(tmp);
62+
Py_DECREF(tmp);
63+
64+
tmp = PyObject_GetAttrString(obj, "month");
65+
if (tmp == NULL)
66+
return -1;
67+
out->month = PyLong_AsLong(tmp);
68+
Py_DECREF(tmp);
69+
70+
tmp = PyObject_GetAttrString(obj, "day");
71+
if (tmp == NULL)
72+
return -1;
73+
out->day = PyLong_AsLong(tmp);
74+
Py_DECREF(tmp);
6175

6276
// TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use
6377
// PyDateTime_Check here, and less verbose attribute lookups.
@@ -70,10 +84,29 @@ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
7084
return 0;
7185
}
7286

73-
out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour"));
74-
out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute"));
75-
out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
76-
out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
87+
tmp = PyObject_GetAttrString(obj, "hour");
88+
if (tmp == NULL)
89+
return -1;
90+
out->hour = PyLong_AsLong(tmp);
91+
Py_DECREF(tmp);
92+
93+
tmp = PyObject_GetAttrString(obj, "minute");
94+
if (tmp == NULL)
95+
return -1;
96+
out->min = PyLong_AsLong(tmp);
97+
Py_DECREF(tmp);
98+
99+
tmp = PyObject_GetAttrString(obj, "second");
100+
if (tmp == NULL)
101+
return -1;
102+
out->sec = PyLong_AsLong(tmp);
103+
Py_DECREF(tmp);
104+
105+
tmp = PyObject_GetAttrString(obj, "microsecond");
106+
if (tmp == NULL)
107+
return -1;
108+
out->us = PyLong_AsLong(tmp);
109+
Py_DECREF(tmp);
77110

78111
if (PyObject_HasAttrString(obj, "tzinfo")) {
79112
PyObject *offset = extract_utc_offset(obj);

pandas/core/construction.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
maybe_cast_to_datetime,
3232
maybe_cast_to_integer_array,
3333
maybe_convert_platform,
34-
maybe_infer_to_datetimelike,
3534
maybe_promote,
3635
)
3736
from pandas.core.dtypes.common import (
@@ -612,7 +611,15 @@ def sanitize_array(
612611
if dtype is None:
613612
subarr = data
614613
if data.dtype == object and infer_object:
615-
subarr = maybe_infer_to_datetimelike(data)
614+
subarr = lib.maybe_convert_objects(
615+
data,
616+
# Here we do not convert numeric dtypes, as if we wanted that,
617+
# numpy would have done it for us.
618+
convert_numeric=False,
619+
convert_non_numeric=True,
620+
convert_to_nullable_dtype=False,
621+
dtype_if_all_nat=np.dtype("M8[s]"),
622+
)
616623
elif data.dtype.kind == "U" and using_string_dtype():
617624
from pandas.core.arrays.string_ import StringDtype
618625

@@ -659,7 +666,15 @@ def sanitize_array(
659666
subarr = maybe_convert_platform(data)
660667
if subarr.dtype == object:
661668
subarr = cast(np.ndarray, subarr)
662-
subarr = maybe_infer_to_datetimelike(subarr)
669+
subarr = lib.maybe_convert_objects(
670+
subarr,
671+
# Here we do not convert numeric dtypes, as if we wanted that,
672+
# numpy would have done it for us.
673+
convert_numeric=False,
674+
convert_non_numeric=True,
675+
convert_to_nullable_dtype=False,
676+
dtype_if_all_nat=np.dtype("M8[s]"),
677+
)
663678

664679
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
665680

pandas/core/dtypes/cast.py

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@
9797
DtypeObj,
9898
NumpyIndexT,
9999
Scalar,
100-
npt,
101100
)
102101

103102
from pandas import Index
@@ -1058,51 +1057,6 @@ def convert_dtypes(
10581057
return inferred_dtype # type: ignore[return-value]
10591058

10601059

1061-
def maybe_infer_to_datetimelike(
1062-
value: npt.NDArray[np.object_],
1063-
convert_to_nullable_dtype: bool = False,
1064-
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
1065-
"""
1066-
we might have a array (or single object) that is datetime like,
1067-
and no dtype is passed don't change the value unless we find a
1068-
datetime/timedelta set
1069-
1070-
this is pretty strict in that a datetime/timedelta is REQUIRED
1071-
in addition to possible nulls/string likes
1072-
1073-
Parameters
1074-
----------
1075-
value : np.ndarray[object]
1076-
1077-
Returns
1078-
-------
1079-
np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
1080-
1081-
"""
1082-
if not isinstance(value, np.ndarray) or value.dtype != object:
1083-
# Caller is responsible for passing only ndarray[object]
1084-
raise TypeError(type(value)) # pragma: no cover
1085-
if value.ndim != 1:
1086-
# Caller is responsible
1087-
raise ValueError(value.ndim) # pragma: no cover
1088-
1089-
if not len(value):
1090-
return value
1091-
1092-
# error: Incompatible return value type (got "Union[ExtensionArray,
1093-
# ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
1094-
# TimedeltaArray, PeriodArray, IntervalArray]")
1095-
return lib.maybe_convert_objects( # type: ignore[return-value]
1096-
value,
1097-
# Here we do not convert numeric dtypes, as if we wanted that,
1098-
# numpy would have done it for us.
1099-
convert_numeric=False,
1100-
convert_non_numeric=True,
1101-
convert_to_nullable_dtype=convert_to_nullable_dtype,
1102-
dtype_if_all_nat=np.dtype("M8[s]"),
1103-
)
1104-
1105-
11061060
def maybe_cast_to_datetime(
11071061
value: np.ndarray | list, dtype: np.dtype
11081062
) -> DatetimeArray | TimedeltaArray | np.ndarray:

pandas/core/frame.py

Lines changed: 16 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@
7171
)
7272
from pandas.util._exceptions import (
7373
find_stack_level,
74-
rewrite_warning,
7574
)
7675
from pandas.util._validators import (
7776
validate_ascending,
@@ -11926,25 +11925,13 @@ def _get_data() -> DataFrame:
1192611925
row_index = np.tile(np.arange(nrows), ncols)
1192711926
col_index = np.repeat(np.arange(ncols), nrows)
1192811927
ser = Series(arr, index=col_index, copy=False)
11929-
# GroupBy will raise a warning with SeriesGroupBy as the object,
11930-
# likely confusing users
11931-
with rewrite_warning(
11932-
target_message=(
11933-
f"The behavior of SeriesGroupBy.{name} with all-NA values"
11934-
),
11935-
target_category=FutureWarning,
11936-
new_message=(
11937-
f"The behavior of {type(self).__name__}.{name} with all-NA "
11938-
"values, or any-NA and skipna=False, is deprecated. In "
11939-
"a future version this will raise ValueError"
11940-
),
11941-
):
11942-
result = ser.groupby(row_index).agg(name, **kwds)
11928+
if name == "all":
11929+
# Behavior here appears incorrect; preserving
11930+
# for backwards compatibility for now.
11931+
# See https://github.com/pandas-dev/pandas/issues/57171
11932+
skipna = True
11933+
result = ser.groupby(row_index).agg(name, **kwds, skipna=skipna)
1194311934
result.index = df.index
11944-
if not skipna and name not in ("any", "all"):
11945-
mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
11946-
other = -1 if name in ("idxmax", "idxmin") else lib.no_default
11947-
result = result.mask(mask, other)
1194811935
return result
1194911936

1195011937
df = df.T
@@ -13258,13 +13245,11 @@ def idxmin(
1325813245
# indices will always be np.ndarray since axis is not N
1325913246

1326013247
if (indices == -1).any():
13261-
warnings.warn(
13262-
f"The behavior of {type(self).__name__}.idxmin with all-NA "
13263-
"values, or any-NA and skipna=False, is deprecated. In a future "
13264-
"version this will raise ValueError",
13265-
FutureWarning,
13266-
stacklevel=find_stack_level(),
13267-
)
13248+
if skipna:
13249+
msg = "Encountered all NA values"
13250+
else:
13251+
msg = "Encountered an NA value with skipna=False"
13252+
raise ValueError(msg)
1326813253

1326913254
index = data._get_axis(axis)
1327013255
result = algorithms.take(
@@ -13365,13 +13350,11 @@ def idxmax(
1336513350
# indices will always be 1d array since axis is not None
1336613351

1336713352
if (indices == -1).any():
13368-
warnings.warn(
13369-
f"The behavior of {type(self).__name__}.idxmax with all-NA "
13370-
"values, or any-NA and skipna=False, is deprecated. In a future "
13371-
"version this will raise ValueError",
13372-
FutureWarning,
13373-
stacklevel=find_stack_level(),
13374-
)
13353+
if skipna:
13354+
msg = "Encountered all NA values"
13355+
else:
13356+
msg = "Encountered an NA values with skipna=False"
13357+
raise ValueError(msg)
1337513358

1337613359
index = data._get_axis(axis)
1337713360
result = algorithms.take(

pandas/core/groupby/groupby.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5703,10 +5703,7 @@ def _idxmax_idxmin(
57035703
"Specify observed=True in groupby instead."
57045704
)
57055705
elif not skipna and self._obj_with_exclusions.isna().any(axis=None):
5706-
raise ValueError(
5707-
f"{type(self).__name__}.{how} with skipna=False encountered an NA "
5708-
f"value."
5709-
)
5706+
raise ValueError(f"{how} with skipna=False encountered an NA value.")
57105707

57115708
result = self._agg_general(
57125709
numeric_only=numeric_only,
@@ -5724,8 +5721,7 @@ def _wrap_idxmax_idxmin(
57245721
result = res.astype(index.dtype)
57255722
elif skipna and res.lt(0).any(axis=None):
57265723
raise ValueError(
5727-
f"{type(self).__name__}.{how} with skipna=True encountered all NA "
5728-
f"values in a group."
5724+
f"{how} with skipna=True encountered all NA values in a group."
57295725
)
57305726
else:
57315727
if isinstance(index, MultiIndex):

pandas/core/internals/construction.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
dict_compat,
2525
maybe_cast_to_datetime,
2626
maybe_convert_platform,
27-
maybe_infer_to_datetimelike,
2827
)
2928
from pandas.core.dtypes.common import (
3029
is_1d_only_ea_dtype,
@@ -290,7 +289,18 @@ def ndarray_to_mgr(
290289
# embedded in an object type
291290
if dtype is None and infer_object and is_object_dtype(values.dtype):
292291
obj_columns = list(values)
293-
maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
292+
maybe_datetime = [
293+
lib.maybe_convert_objects(
294+
x,
295+
# Here we do not convert numeric dtypes, as if we wanted that,
296+
# numpy would have done it for us.
297+
convert_numeric=False,
298+
convert_non_numeric=True,
299+
convert_to_nullable_dtype=False,
300+
dtype_if_all_nat=np.dtype("M8[s]"),
301+
)
302+
for x in obj_columns
303+
]
294304
# don't convert (and copy) the objects if no type inference occurs
295305
if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
296306
block_values = [
@@ -485,7 +495,7 @@ def convert(v):
485495

486496
v = extract_array(v, extract_numpy=True)
487497
res = maybe_convert_platform(v)
488-
# We don't do maybe_infer_to_datetimelike here bc we will end up doing
498+
# We don't do maybe_convert_objects here bc we will end up doing
489499
# it column-by-column in ndarray_to_mgr
490500
return res
491501

@@ -965,7 +975,15 @@ def convert(arr):
965975
if arr.dtype == np.dtype("O"):
966976
# i.e. maybe_convert_objects didn't convert
967977
convert_to_nullable_dtype = dtype_backend != "numpy"
968-
arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype)
978+
arr = lib.maybe_convert_objects(
979+
arr,
980+
# Here we do not convert numeric dtypes, as if we wanted that,
981+
# numpy would have done it for us.
982+
convert_numeric=False,
983+
convert_non_numeric=True,
984+
convert_to_nullable_dtype=convert_to_nullable_dtype,
985+
dtype_if_all_nat=np.dtype("M8[s]"),
986+
)
969987
if convert_to_nullable_dtype and arr.dtype == np.dtype("O"):
970988
new_dtype = StringDtype()
971989
arr_cls = new_dtype.construct_array_type()

pandas/tests/extension/test_arrow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,9 +3333,9 @@ def test_factorize_chunked_dictionary():
33333333
)
33343334
ser = pd.Series(ArrowExtensionArray(pa_array))
33353335
res_indices, res_uniques = ser.factorize()
3336-
exp_indicies = np.array([0, 1], dtype=np.intp)
3336+
exp_indices = np.array([0, 1], dtype=np.intp)
33373337
exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks()))
3338-
tm.assert_numpy_array_equal(res_indices, exp_indicies)
3338+
tm.assert_numpy_array_equal(res_indices, exp_indices)
33393339
tm.assert_index_equal(res_uniques, exp_uniques)
33403340

33413341

pandas/tests/frame/test_reductions.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2160,9 +2160,7 @@ def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
21602160
kwargs["min_count"] = min_count
21612161

21622162
if not skipna and method in ("idxmax", "idxmin"):
2163-
# GH#57745 - EAs use groupby for axis=1 which still needs a proper deprecation.
2164-
msg = f"The behavior of DataFrame.{method} with all-NA values"
2165-
with tm.assert_produces_warning(FutureWarning, match=msg):
2163+
with pytest.raises(ValueError, match="encountered an NA value"):
21662164
getattr(df, method)(axis=1, **kwargs)
21672165
with pytest.raises(ValueError, match="Encountered an NA value"):
21682166
getattr(expected_df, method)(axis=1, **kwargs)

pandas/tests/groupby/test_libgroupby.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
285285
)
286286

287287

288-
def test_cython_group_mean_Inf_at_begining_and_end():
288+
def test_cython_group_mean_Inf_at_beginning_and_end():
289289
# GH 50367
290290
actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
291291
counts = np.array([0, 0], dtype="int64")
@@ -314,7 +314,7 @@ def test_cython_group_mean_Inf_at_begining_and_end():
314314
([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
315315
],
316316
)
317-
def test_cython_group_sum_Inf_at_begining_and_end(values, out):
317+
def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
318318
# GH #53606
319319
actual = np.array([[np.nan], [np.nan]], dtype="float64")
320320
counts = np.array([0, 0], dtype="int64")

0 commit comments

Comments
 (0)