From 095778b85d0981cd36ae6c75bd8a3d25a4a0cc0a Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Sat, 27 Sep 2025 16:44:01 -0400 Subject: [PATCH 1/4] add strict param --- pandas/core/reshape/concat.py | 8 ++++---- pandas/core/reshape/encoding.py | 14 ++++++++++---- pandas/core/reshape/melt.py | 2 +- pandas/core/reshape/merge.py | 18 +++++++++--------- pandas/core/reshape/pivot.py | 4 ++-- pandas/core/reshape/reshape.py | 12 +++++++----- 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e39c716784455..db7f33d5c017f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -840,7 +840,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde if (levels is None and isinstance(keys[0], tuple)) or ( levels is not None and len(levels) > 1 ): - zipped = list(zip(*keys)) + zipped = list(zip(*keys, strict=True)) if names is None: names = [None] * len(zipped) @@ -866,13 +866,13 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays - for hlevel, level in zip(zipped, levels): + for hlevel, level in zip(zipped, levels, strict=True): to_concat = [] if isinstance(hlevel, Index) and hlevel.equals(level): lens = [len(idx) for idx in indexes] codes_list.append(np.repeat(np.arange(len(hlevel)), lens)) else: - for key, index in zip(hlevel, indexes): + for key, index in zip(hlevel, indexes, strict=True): # Find matching codes, include matching nan values as equal. mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): @@ -922,7 +922,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde # do something a bit more speedy - for hlevel, level in zip(zipped, levels): + for hlevel, level in zip(zipped, levels, strict=True): hlevel_index = ensure_index(hlevel) mapped = level.get_indexer(hlevel_index) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 67fb075110f0d..3e71724a70606 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -209,7 +209,9 @@ def check_len(item, name: str) -> None: # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] - for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep): + for col, pre, sep in zip( + data_to_encode.items(), prefix, prefix_sep, strict=False + ): # col is (column_name, column), use just column data here dummy = _get_dummies_1d( col[1], @@ -323,7 +325,7 @@ def get_empty_frame(data) -> DataFrame: codes = codes[mask] n_idx = np.arange(N)[mask] - for ndx, code in zip(n_idx, codes): + for ndx, code in zip(n_idx, codes, strict=True): sp_indices[code].append(ndx) if drop_first: @@ -331,7 +333,7 @@ def get_empty_frame(data) -> DataFrame: # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] - for col, ixs in zip(dummy_cols, sp_indices): + for col, ixs in zip(dummy_cols, sp_indices, strict=True): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), @@ -535,7 +537,11 @@ def from_dummies( raise ValueError(len_msg) elif isinstance(default_category, Hashable): default_category = dict( - zip(variables_slice, [default_category] * len(variables_slice)) + zip( + variables_slice, + [default_category] * len(variables_slice), + strict=True, + ) ) else: raise TypeError( diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 5d4b15c9a0ca3..aeab833878583 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -199,7 +199,7 @@ def melt( missing = idx == -1 if missing.any(): missing_labels = [ - lab for lab, not_found in zip(labels, missing) if not_found + lab for lab, not_found in zip(labels, missing, strict=True) if not_found ] raise KeyError( "The following id_vars or value_vars are not present in " diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f35b0ef197288..941b7fd6d2d42 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1230,7 +1230,7 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> None: """ names_to_restore = [] for name, left_key, right_key in zip( - self.join_names, self.left_on, self.right_on + self.join_names, self.left_on, self.right_on, strict=True ): if ( # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible @@ -1263,7 +1263,7 @@ def _maybe_add_join_keys( assert all(isinstance(x, _known) for x in self.left_join_keys) - keys = zip(self.join_names, self.left_on, self.right_on) + keys = zip(self.join_names, self.left_on, self.right_on, strict=True) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue @@ -1572,7 +1572,7 @@ def _get_merge_keys( # ugh, spaghetti re #733 if _any(self.left_on) and _any(self.right_on): - for lk, rk in zip(self.left_on, self.right_on): + for lk, rk in zip(self.left_on, self.right_on, strict=True): lk = extract_array(lk, extract_numpy=True) rk = extract_array(rk, extract_numpy=True) if is_lkey(lk): @@ -1635,7 +1635,7 @@ def _get_merge_keys( right_keys = [ lev._values.take(lev_codes) for lev, lev_codes in zip( - self.right.index.levels, self.right.index.codes + self.right.index.levels, self.right.index.codes, strict=True ) ] else: @@ -1657,7 +1657,7 @@ def _get_merge_keys( left_keys = [ lev._values.take(lev_codes) for lev, lev_codes in zip( - self.left.index.levels, self.left.index.codes + self.left.index.levels, self.left.index.codes, strict=True ) ] else: @@ -1674,7 +1674,7 @@ def _maybe_coerce_merge_keys(self) -> None: # or if we have object and integer dtypes for lk, rk, name in zip( - self.left_join_keys, self.right_join_keys, self.join_names + self.left_join_keys, self.right_join_keys, self.join_names, strict=True ): if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): continue @@ -2042,7 +2042,7 @@ def get_join_indexers( _factorize_keys(left_keys[n], right_keys[n], sort=sort) for n in range(len(left_keys)) ) - zipped = zip(*mapped) + zipped = zip(*mapped, strict=True) llab, rlab, shape = (list(x) for x in zipped) # get flat i8 keys from label lists @@ -2427,7 +2427,7 @@ def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int) -> None: raise MergeError(msg) # validate index types are the same - for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys, strict=True)): _check_dtype_match(lk, rk, i) if self.left_index: @@ -2612,7 +2612,7 @@ def _get_multiindex_indexer( _factorize_keys(index.levels[n]._values, join_keys[n], sort=sort) for n in range(index.nlevels) ) - zipped = zip(*mapped) + zipped = zip(*mapped, strict=True) rcodes, lcodes, shape = (list(x) for x in zipped) if sort: rcodes = list(map(np.take, rcodes, index.codes)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 04c584c226aed..30b96c047aab8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1098,8 +1098,8 @@ def crosstab( from pandas import DataFrame data = { - **dict(zip(unique_rownames, index)), - **dict(zip(unique_colnames, columns)), + **dict(zip(unique_rownames, index, strict=True)), + **dict(zip(unique_colnames, columns, strict=True)), } df = DataFrame(data, index=common_idx) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c14389d753aac..02fd261176a17 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -696,7 +696,9 @@ def stack_factorize(index): levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: - levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns))) + levels, (ilab, clab) = zip( + *map(stack_factorize, (frame.index, frame.columns)), strict=True + ) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, @@ -778,13 +780,13 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: levs = ( [lev[c] if c >= 0 else None for c in codes] - for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) + for lev, codes in zip(columns.levels[:-1], columns.codes[:-1], strict=True) ) # Remove duplicate tuples in the MultiIndex. - tuples = zip(*levs) + tuples = zip(*levs, strict=True) unique_tuples = (key for key, _ in itertools.groupby(tuples)) - new_levs = zip(*unique_tuples) + new_levs = zip(*unique_tuples, strict=True) # The dtype of each level must be explicitly set to avoid inferring the wrong type. # See GH-36991. @@ -792,7 +794,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev - for new_lev, lev in zip(new_levs, columns.levels) + for new_lev, lev in zip(new_levs, columns.levels, strict=True) ], names=columns.names[:-1], ) From d10ed5a709a6e0423fdf7b99ad4403a751bba435 Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Sun, 19 Oct 2025 16:15:24 -0400 Subject: [PATCH 2/4] Fix tests --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 02fd261176a17..924bd27132f32 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -794,7 +794,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev - for new_lev, lev in zip(new_levs, columns.levels, strict=True) + for new_lev, lev in zip(new_levs, columns.levels) ], names=columns.names[:-1], ) From 995082293320445670b8665db908808344dfbc66 Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Sun, 19 Oct 2025 18:24:22 -0400 Subject: [PATCH 3/4] Fix tests + pyproject.toml --- pandas/core/reshape/reshape.py | 4 ++-- pyproject.toml | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 924bd27132f32..1c7de975711bc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -780,7 +780,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: levs = ( [lev[c] if c >= 0 else None for c in codes] - for lev, codes in zip(columns.levels[:-1], columns.codes[:-1], strict=True) + for lev, codes in zip(columns.levels[:-1], columns.codes[:-1], strict=False) ) # Remove duplicate tuples in the MultiIndex. @@ -794,7 +794,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev - for new_lev, lev in zip(new_levs, columns.levels) + for new_lev, lev in zip(new_levs, columns.levels, strict=False) ], names=columns.names[:-1], ) diff --git a/pyproject.toml b/pyproject.toml index 094d0b44a6721..bdf84f01f4dfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -453,12 +453,6 @@ exclude = [ "pandas/core/groupby/grouper.py" = ["B905"] "pandas/core/groupby/ops.py" = ["B905"] "pandas/core/methods/to_dict.py" = ["B905"] -"pandas/core/reshape/concat.py" = ["B905"] -"pandas/core/reshape/encoding.py" = ["B905"] -"pandas/core/reshape/melt.py" = ["B905"] -"pandas/core/reshape/merge.py" = ["B905"] -"pandas/core/reshape/pivot.py" = ["B905"] -"pandas/core/reshape/reshape.py" = ["B905"] "pandas/core/window/rolling.py" = ["B905"] "pandas/_testing/asserters.py" = ["B905"] "pandas/_testing/_warnings.py" = ["B905"] From e31820d894a18304f1775d5d1ae5296c905a4f5f Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Mon, 3 Nov 2025 17:07:58 -0500 Subject: [PATCH 4/4] PR fixes --- pandas/core/reshape/encoding.py | 10 +++++++--- pandas/core/reshape/reshape.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 4c9cd10723444..2220a311c2bf0 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -183,7 +183,9 @@ def check_len(item, name: str) -> None: check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): - prefix = itertools.cycle([prefix]) + prefix = itertools.islice( + itertools.cycle([prefix]), data_to_encode.shape[1] + ) if isinstance(prefix, dict): prefix = [prefix[col] for col in data_to_encode.columns] @@ -192,7 +194,9 @@ def check_len(item, name: str) -> None: # validate separators if isinstance(prefix_sep, str): - prefix_sep = itertools.cycle([prefix_sep]) + prefix_sep = itertools.islice( + itertools.cycle([prefix_sep]), data_to_encode.shape[1] + ) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] @@ -210,7 +214,7 @@ def check_len(item, name: str) -> None: with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for col, pre, sep in zip( - data_to_encode.items(), prefix, prefix_sep, strict=False + data_to_encode.items(), prefix, prefix_sep, strict=True ): # col is (column_name, column), use just column data here dummy = _get_dummies_1d( diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1c7de975711bc..dbed6abbbcb50 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -780,7 +780,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: levs = ( [lev[c] if c >= 0 else None for c in codes] - for lev, codes in zip(columns.levels[:-1], columns.codes[:-1], strict=False) + for lev, codes in zip(columns.levels[:-1], columns.codes[:-1], strict=True) ) # Remove duplicate tuples in the MultiIndex.