Skip to content

Commit 58d075c

Browse files
committed
Completed implementation, testing and documentation
1 parent 2d73d62 commit 58d075c

File tree

6 files changed

+183
-18
lines changed

6 files changed

+183
-18
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ Other enhancements
191191
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
192192
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
193193
- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
194+
- :meth:`DataFrame.unstack` and :meth:`Series.unstack` now support a ``no_fill`` parameter that raises a ``ValueError`` if any missing values would need to be filled during the unstack operation, allowing users to enforce data integrity when a complete 1:1 mapping between stacked and unstacked representations is expected (:issue:`XXXXX`)
194195
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
195196
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
196197
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)

pandas/core/frame.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10320,7 +10320,11 @@ def explode(
1032010320
return result.__finalize__(self, method="explode")
1032110321

1032210322
def unstack(
10323-
self, level: IndexLabel = -1, fill_value=None, sort: bool = True
10323+
self,
10324+
level: IndexLabel = -1,
10325+
fill_value=None,
10326+
sort: bool = True,
10327+
no_fill: bool = False,
1032410328
) -> DataFrame | Series:
1032510329
"""
1032610330
Pivot a level of the (necessarily hierarchical) index labels.
@@ -10339,13 +10343,25 @@ def unstack(
1033910343
Replace NaN with this value if the unstack produces missing values.
1034010344
sort : bool, default True
1034110345
Sort the level(s) in the resulting MultiIndex columns.
10346+
no_fill : bool, default False
10347+
If True, raise a ValueError if any missing values would need to be filled.
10348+
This is useful to ensure data integrity when you expect a complete
10349+
1:1 mapping between stacked and unstacked representations.
10350+
10351+
.. versionadded:: 3.0.0
1034210352
1034310353
Returns
1034410354
-------
1034510355
Series or DataFrame
1034610356
If index is a MultiIndex: DataFrame with pivoted index labels as new
1034710357
inner-most level column labels, else Series.
1034810358
10359+
Raises
10360+
------
10361+
ValueError
10362+
If `no_fill` is True and the unstacking operation would require filling
10363+
missing values.
10364+
1034910365
See Also
1035010366
--------
1035110367
DataFrame.pivot : Pivot a table based on column values.
@@ -10389,7 +10405,7 @@ def unstack(
1038910405
"""
1039010406
from pandas.core.reshape.reshape import unstack
1039110407

10392-
result = unstack(self, level, fill_value, sort)
10408+
result = unstack(self, level, fill_value, sort, no_fill)
1039310409

1039410410
return result.__finalize__(self, method="unstack")
1039510411

pandas/core/reshape/reshape.py

Lines changed: 70 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,16 @@ class _Unstacker:
119119
"""
120120

121121
def __init__(
122-
self, index: MultiIndex, level: Level, constructor, sort: bool = True
122+
self,
123+
index: MultiIndex,
124+
level: Level,
125+
constructor,
126+
sort: bool = True,
127+
no_fill: bool = False,
123128
) -> None:
124129
self.constructor = constructor
125130
self.sort = sort
131+
self.no_fill = no_fill
126132

127133
self.index = index.remove_unused_levels()
128134

@@ -290,6 +296,29 @@ def get_new_values(self, values, fill_value=None):
290296
mask = self.mask
291297
mask_all = self.mask_all
292298

299+
if self.no_fill and not mask_all:
300+
missing_positions = np.where(~mask)[0]
301+
if len(missing_positions) > 0:
302+
first_missing = missing_positions[0]
303+
row_idx = first_missing // width
304+
col_idx = first_missing % width
305+
306+
index_label = (
307+
self.new_index[row_idx]
308+
if row_idx < len(self.new_index)
309+
else row_idx
310+
)
311+
col_label = (
312+
self.removed_level[col_idx]
313+
if col_idx < len(self.removed_level)
314+
else col_idx
315+
)
316+
317+
raise ValueError(
318+
f"Cannot unstack with no_fill=True because filling is required. "
319+
f"Missing value at index {index_label}, column {col_label}."
320+
)
321+
293322
# we can simply reshape if we don't have a mask
294323
if mask_all and len(values):
295324
# TODO: Under what circumstances can we rely on sorted_values
@@ -457,7 +486,11 @@ def new_index(self) -> MultiIndex | Index:
457486

458487

459488
def _unstack_multiple(
460-
data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
489+
data: Series | DataFrame,
490+
clocs,
491+
fill_value=None,
492+
sort: bool = True,
493+
no_fill: bool = False,
461494
):
462495
if len(clocs) == 0:
463496
return data
@@ -503,7 +536,9 @@ def _unstack_multiple(
503536
dummy = data.copy(deep=False)
504537
dummy.index = dummy_index
505538

506-
unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
539+
unstacked = dummy.unstack(
540+
"__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill
541+
)
507542
new_levels = clevels
508543
new_names = cnames
509544
new_codes = recons_codes
@@ -515,7 +550,7 @@ def _unstack_multiple(
515550
# error: Incompatible types in assignment (expression has type
516551
# "DataFrame | Series", variable has type "DataFrame")
517552
result = result.unstack( # type: ignore[assignment]
518-
val, fill_value=fill_value, sort=sort
553+
val, fill_value=fill_value, sort=sort, no_fill=no_fill
519554
)
520555
clocs = [v if v < val else v - 1 for v in clocs]
521556

@@ -528,7 +563,7 @@ def _unstack_multiple(
528563
# error: Incompatible types in assignment (expression has type "DataFrame |
529564
# Series", variable has type "DataFrame")
530565
unstacked = dummy_df.unstack( # type: ignore[assignment]
531-
"__placeholder__", fill_value=fill_value, sort=sort
566+
"__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill
532567
)
533568
if isinstance(unstacked, Series):
534569
unstcols = unstacked.index
@@ -559,18 +594,28 @@ def unstack(obj: Series, level, fill_value=..., sort: bool = ...) -> DataFrame:
559594

560595
@overload
561596
def unstack(
562-
obj: Series | DataFrame, level, fill_value=..., sort: bool = ...
597+
obj: Series | DataFrame,
598+
level,
599+
fill_value=...,
600+
sort: bool = ...,
601+
no_fill: bool = ...,
563602
) -> Series | DataFrame: ...
564603

565604

566605
def unstack(
567-
obj: Series | DataFrame, level, fill_value=None, sort: bool = True
606+
obj: Series | DataFrame,
607+
level,
608+
fill_value=None,
609+
sort: bool = True,
610+
no_fill: bool = False,
568611
) -> Series | DataFrame:
569612
if isinstance(level, (tuple, list)):
570613
if len(level) != 1:
571614
# _unstack_multiple only handles MultiIndexes,
572615
# and isn't needed for a single level
573-
return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
616+
return _unstack_multiple(
617+
obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill
618+
)
574619
else:
575620
level = level[0]
576621

@@ -580,7 +625,9 @@ def unstack(
580625

581626
if isinstance(obj, DataFrame):
582627
if isinstance(obj.index, MultiIndex):
583-
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
628+
return _unstack_frame(
629+
obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill
630+
)
584631
else:
585632
return obj.T.stack()
586633
elif not isinstance(obj.index, MultiIndex):
@@ -592,19 +639,25 @@ def unstack(
592639
)
593640
else:
594641
if is_1d_only_ea_dtype(obj.dtype):
595-
return _unstack_extension_series(obj, level, fill_value, sort=sort)
642+
return _unstack_extension_series(
643+
obj, level, fill_value, sort=sort, no_fill=no_fill
644+
)
596645
unstacker = _Unstacker(
597-
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
646+
obj.index,
647+
level=level,
648+
constructor=obj._constructor_expanddim,
649+
sort=sort,
650+
no_fill=no_fill,
598651
)
599652
return unstacker.get_result(obj, value_columns=None, fill_value=fill_value)
600653

601654

602655
def _unstack_frame(
603-
obj: DataFrame, level, fill_value=None, sort: bool = True
656+
obj: DataFrame, level, fill_value=None, sort: bool = True, no_fill: bool = False
604657
) -> DataFrame:
605658
assert isinstance(obj.index, MultiIndex) # checked by caller
606659
unstacker = _Unstacker(
607-
obj.index, level=level, constructor=obj._constructor, sort=sort
660+
obj.index, level=level, constructor=obj._constructor, sort=sort, no_fill=no_fill
608661
)
609662

610663
if not obj._can_fast_transpose:
@@ -617,7 +670,7 @@ def _unstack_frame(
617670

618671

619672
def _unstack_extension_series(
620-
series: Series, level, fill_value, sort: bool
673+
series: Series, level, fill_value, sort: bool, no_fill: bool = False
621674
) -> DataFrame:
622675
"""
623676
Unstack an ExtensionArray-backed Series.
@@ -636,6 +689,8 @@ def _unstack_extension_series(
636689
``series.values.take``.
637690
sort : bool
638691
Whether to sort the resulting MultiIndex levels
692+
no_fill : bool, default False
693+
If True, raise a ValueError if the unstack would require filling missing values
639694
640695
Returns
641696
-------
@@ -645,7 +700,7 @@ def _unstack_extension_series(
645700
"""
646701
# Defer to the logic in ExtensionBlock._unstack
647702
df = series.to_frame()
648-
result = df.unstack(level=level, fill_value=fill_value, sort=sort)
703+
result = df.unstack(level=level, fill_value=fill_value, sort=sort, no_fill=no_fill)
649704

650705
# equiv: result.droplevel(level=0, axis=1)
651706
# but this avoids an extra copy

pandas/core/series.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4282,6 +4282,7 @@ def unstack(
42824282
level: IndexLabel = -1,
42834283
fill_value: Hashable | None = None,
42844284
sort: bool = True,
4285+
no_fill: bool = False,
42854286
) -> DataFrame:
42864287
"""
42874288
Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
@@ -4294,6 +4295,10 @@ def unstack(
42944295
Value to use when replacing NaN values.
42954296
sort : bool, default True
42964297
Sort the level(s) in the resulting MultiIndex columns.
4298+
no_fill : bool, default False
4299+
If True, raise a ValueError if any missing values would need to be filled.
4300+
This is useful to ensure data integrity when you expect a complete
4301+
1:1 mapping between stacked and unstacked representations.
42974302
42984303
Returns
42994304
-------
@@ -4333,7 +4338,7 @@ def unstack(
43334338
"""
43344339
from pandas.core.reshape.reshape import unstack
43354340

4336-
return unstack(self, level, fill_value, sort)
4341+
return unstack(self, level, fill_value, sort, no_fill)
43374342

43384343
# ----------------------------------------------------------------------
43394344
# function application

pandas/tests/frame/test_stack_unstack.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2779,3 +2779,56 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
27792779
)
27802780
expected = Series(1, index=expected_index)
27812781
tm.assert_series_equal(result, expected)
2782+
2783+
2784+
def test_unstack_no_fill_complete_data():
2785+
df = DataFrame(
2786+
{"value": [1, 2, 3, 4]},
2787+
index=MultiIndex.from_product([["A", "B"], ["x", "y"]]),
2788+
)
2789+
2790+
result = df.unstack(level=-1, no_fill=True)
2791+
expected = DataFrame(
2792+
[[1, 2], [3, 4]],
2793+
index=["A", "B"],
2794+
columns=MultiIndex.from_tuples([("value", "x"), ("value", "y")]),
2795+
)
2796+
tm.assert_frame_equal(result, expected)
2797+
2798+
2799+
def test_unstack_no_fill_incomplete_data():
2800+
df = DataFrame(
2801+
{"value": [1, 2, 3]},
2802+
index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]),
2803+
)
2804+
2805+
# Should raise ValueError when no_fill=True and filling is required
2806+
msg = "Cannot unstack with no_fill=True because filling is required"
2807+
with pytest.raises(ValueError, match=msg):
2808+
df.unstack(level=-1, no_fill=True)
2809+
2810+
2811+
def test_unstack_no_fill_default_behavior():
2812+
df = DataFrame(
2813+
{"value": [1, 2, 3]},
2814+
index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]),
2815+
)
2816+
2817+
result = df.unstack(level=-1, no_fill=False)
2818+
expected = DataFrame(
2819+
[[1.0, 2.0], [3.0, np.nan]],
2820+
index=["A", "B"],
2821+
columns=MultiIndex.from_tuples([("value", "x"), ("value", "y")]),
2822+
)
2823+
tm.assert_frame_equal(result, expected)
2824+
2825+
2826+
def test_unstack_no_fill_with_fill_value():
2827+
df = DataFrame(
2828+
{"value": [1, 2, 3]},
2829+
index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]),
2830+
)
2831+
2832+
msg = "Cannot unstack with no_fill=True because filling is required"
2833+
with pytest.raises(ValueError, match=msg):
2834+
df.unstack(level=-1, fill_value=0, no_fill=True)

pandas/tests/series/methods/test_unstack.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,38 @@ def test_unstack_mixed_level_names():
167167
index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
168168
)
169169
tm.assert_frame_equal(result, expected)
170+
171+
172+
def test_unstack_no_fill_complete_data():
173+
index = MultiIndex.from_product([["one", "two"], ["a", "b"]])
174+
ser = Series(np.arange(1.0, 5.0), index=index)
175+
176+
result = ser.unstack(level=-1, no_fill=True)
177+
expected = DataFrame(
178+
[[1.0, 2.0], [3.0, 4.0]],
179+
index=["one", "two"],
180+
columns=["a", "b"],
181+
)
182+
tm.assert_frame_equal(result, expected)
183+
184+
185+
def test_unstack_no_fill_incomplete_data():
186+
index = MultiIndex.from_tuples([("one", "a"), ("one", "b"), ("two", "a")])
187+
ser = Series([1, 2, 3], index=index)
188+
189+
msg = "Cannot unstack with no_fill=True because filling is required"
190+
with pytest.raises(ValueError, match=msg):
191+
ser.unstack(level=-1, no_fill=True)
192+
193+
194+
def test_unstack_no_fill_default_behavior():
195+
index = MultiIndex.from_tuples([("one", "a"), ("one", "b"), ("two", "a")])
196+
ser = Series([1, 2, 3], index=index)
197+
198+
result = ser.unstack(level=-1, no_fill=False)
199+
expected = DataFrame(
200+
[[1.0, 2.0], [3.0, np.nan]],
201+
index=["one", "two"],
202+
columns=["a", "b"],
203+
)
204+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)