Commit ccf5621

Merge branch 'main' into enh-list-arith

2 parents: 35c85ab + 1863adb

8 files changed (+157, -92 lines)

ci/deps/actions-311.yaml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ dependencies:
 - psycopg2>=2.9.10
 - pyarrow>=13.0.0
 - pyiceberg>=0.8.1
+- pydantic<2.12.0 # TMP pin to avoid pyiceberg/pydantic issues
 - pymysql>=1.1.1
 - pyreadstat>=1.2.8
 - pytables>=3.10.1

ci/deps/actions-312.yaml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ dependencies:
 - psycopg2>=2.9.10
 - pyarrow>=13.0.0
 - pyiceberg>=0.8.1
+- pydantic<2.12.0 # TMP pin to avoid pyiceberg/pydantic issues
 - pymysql>=1.1.1
 - pyreadstat>=1.2.8
 - pytables>=3.10.1

pandas/io/stata.py

Lines changed: 148 additions & 81 deletions

@@ -44,7 +44,6 @@
     ValueLabelTypeMismatch,
 )
 from pandas.util._decorators import (
-    Appender,
     doc,
     set_module,
 )
@@ -127,10 +126,6 @@
     Return StataReader object for iterations, returns chunks with
     given number of lines."""

-_iterator_params = """\
-iterator : bool, default False
-    Return StataReader object."""
-
 _reader_notes = """\
 Notes
 -----
@@ -139,80 +134,6 @@
 file is associated to an incomplete set of value labels that only
 label a strict subset of the values."""

-_read_stata_doc = f"""
-Read Stata file into DataFrame.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
-    Any valid string path is acceptable. The string could be a URL. Valid
-    URL schemes include http, ftp, s3, and file. For file URLs, a host is
-    expected. A local file could be: ``file://localhost/path/to/table.dta``.
-
-    If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-    By file-like object, we refer to objects with a ``read()`` method,
-    such as a file handle (e.g. via builtin ``open`` function)
-    or ``StringIO``.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-{_chunksize_params}
-{_iterator_params}
-{_shared_docs["decompression_options"] % "filepath_or_buffer"}
-{_shared_docs["storage_options"]}
-
-Returns
--------
-DataFrame, pandas.api.typing.StataReader
-    If iterator or chunksize, returns StataReader, else DataFrame.
-
-See Also
---------
-io.stata.StataReader : Low-level reader for Stata data files.
-DataFrame.to_stata: Export Stata data files.
-
-{_reader_notes}
-
-Examples
---------
-
-Creating a dummy stata for this example
-
->>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
-...                     'speed': [350, 18, 361, 15]}})  # doctest: +SKIP
->>> df.to_stata('animals.dta')  # doctest: +SKIP
-
-Read a Stata dta file:
-
->>> df = pd.read_stata('animals.dta')  # doctest: +SKIP
-
-Read a Stata dta file in 10,000 line chunks:
-
->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")  # doctest: +SKIP
->>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
->>> df.to_stata('filename.dta')  # doctest: +SKIP
-
->>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
->>> for chunk in itr:
-...     # Operate on a single chunk, e.g., chunk.mean()
-...     pass  # doctest: +SKIP
-"""
-
-_read_method_doc = f"""\
-Reads observations from Stata file, converting them into a dataframe
-
-Parameters
-----------
-nrows : int
-    Number of lines to read from data file, if None read whole file.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-
-Returns
--------
-DataFrame
-"""
-
 _stata_reader_doc = f"""\
 Class for reading Stata dta files.

@@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
             size = self._chunksize
         return self.read(nrows=size)

-    @Appender(_read_method_doc)
     def read(
         self,
         nrows: int | None = None,
@@ -1689,6 +1609,38 @@ def read(
         columns: Sequence[str] | None = None,
         order_categoricals: bool | None = None,
     ) -> DataFrame:
+        """
+        Reads observations from Stata file, converting them into a dataframe
+
+        Parameters
+        ----------
+        nrows : int
+            Number of lines to read from data file, if None read whole file.
+        convert_dates : bool, default True
+            Convert date variables to DataFrame time values.
+        convert_categoricals : bool, default True
+            Read value labels and convert columns to Categorical/Factor variables.
+        index_col : str, optional
+            Column to set as index.
+        convert_missing : bool, default False
+            Flag indicating whether to convert missing values to their Stata
+            representations. If False, missing values are replaced with nan.
+            If True, columns containing missing values are returned with
+            object data types and missing values are represented by
+            StataMissingValue objects.
+        preserve_dtypes : bool, default True
+            Preserve Stata datatypes. If False, numeric data are upcast to pandas
+            default types for foreign data (float64 or int64).
+        columns : list or None
+            Columns to retain. Columns will be returned in the given order. None
+            returns all columns.
+        order_categoricals : bool, default True
+            Flag indicating whether converted categorical data are ordered.
+
+        Returns
+        -------
+        DataFrame
+        """
         self._ensure_open()

         # Handle options
@@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:


 @set_module("pandas")
-@Appender(_read_stata_doc)
 def read_stata(
     filepath_or_buffer: FilePath | ReadBuffer[bytes],
     *,
@@ -2151,6 +2102,122 @@ def read_stata(
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions | None = None,
 ) -> DataFrame | StataReader:
+    """
+    Read Stata file into DataFrame.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be: ``file://localhost/path/to/table.dta``.
+
+        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method,
+        such as a file handle (e.g. via builtin ``open`` function)
+        or ``StringIO``.
+    convert_dates : bool, default True
+        Convert date variables to DataFrame time values.
+    convert_categoricals : bool, default True
+        Read value labels and convert columns to Categorical/Factor variables.
+    index_col : str, optional
+        Column to set as index.
+    convert_missing : bool, default False
+        Flag indicating whether to convert missing values to their Stata
+        representations. If False, missing values are replaced with nan.
+        If True, columns containing missing values are returned with
+        object data types and missing values are represented by
+        StataMissingValue objects.
+    preserve_dtypes : bool, default True
+        Preserve Stata datatypes. If False, numeric data are upcast to pandas
+        default types for foreign data (float64 or int64).
+    columns : list or None
+        Columns to retain. Columns will be returned in the given order. None
+        returns all columns.
+    order_categoricals : bool, default True
+        Flag indicating whether converted categorical data are ordered.
+    chunksize : int, default None
+        Return StataReader object for iterations, returns chunks with
+        given number of lines.
+    iterator : bool, default False
+        Return StataReader object.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one
+        data file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    Returns
+    -------
+    DataFrame, pandas.api.typing.StataReader
+        If iterator or chunksize, returns StataReader, else DataFrame.
+
+    See Also
+    --------
+    io.stata.StataReader : Low-level reader for Stata data files.
+    DataFrame.to_stata: Export Stata data files.
+
+    Notes
+    -----
+    Categorical variables read through an iterator may not have the same
+    categories and dtype. This occurs when a variable stored in a DTA
+    file is associated to an incomplete set of value labels that only
+    label a strict subset of the values.
+
+    Examples
+    --------
+
+    Creating a dummy stata for this example
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+    ...         "speed": [350, 18, 361, 15],
+    ...     }
+    ... )  # doctest: +SKIP
+    >>> df.to_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file:
+
+    >>> df = pd.read_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file in 10,000 line chunks:
+
+    >>> values = np.random.randint(
+    ...     0, 10, size=(20_000, 1), dtype="uint8"
+    ... )  # doctest: +SKIP
+    >>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
+    >>> df.to_stata("filename.dta")  # doctest: +SKIP
+
+    >>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
+    >>> for chunk in itr:
+    ...     # Operate on a single chunk, e.g., chunk.mean()
+    ...     pass  # doctest: +SKIP
+    """
     reader = StataReader(
         filepath_or_buffer,
         convert_dates=convert_dates,

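Aside: the relocated read_stata docstring above describes chunked reading through a StataReader. A minimal runnable sketch of that pattern, with an illustrative file name that is not part of this commit:

import numpy as np
import pandas as pd

# Write a small throwaway .dta file; "chunks_demo.dta" is a hypothetical name.
values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")
pd.DataFrame(values, columns=["i"]).to_stata("chunks_demo.dta")

# With chunksize, read_stata returns a StataReader usable as a context
# manager; each iteration yields a DataFrame of at most 10,000 rows.
with pd.read_stata("chunks_demo.dta", chunksize=10_000) as itr:
    for chunk in itr:
        print(chunk["i"].mean())
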
pandas/tests/indexes/multi/test_analytics.py

Lines changed: 1 addition & 1 deletion

@@ -185,7 +185,7 @@ def test_map(idx):
 @pytest.mark.parametrize(
     "mapper",
     [
-        lambda values, idx: {i: e for e, i in zip(values, idx)},
+        lambda values, idx: {i: e for e, i in zip(values, idx, strict=True)},
         lambda values, idx: pd.Series(values, idx),
     ],
 )

pandas/tests/indexes/multi/test_constructors.py

Lines changed: 4 additions & 4 deletions

@@ -155,7 +155,7 @@ def test_copy_in_constructor():
 def test_from_arrays(idx):
     arrays = [
         np.asarray(lev).take(level_codes)
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     ]

     # list of arrays as input
@@ -172,7 +172,7 @@ def test_from_arrays_iterator(idx):
     # GH 18434
     arrays = [
         np.asarray(lev).take(level_codes)
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     ]

     # iterator as input
@@ -188,7 +188,7 @@ def test_from_arrays_iterator(idx):
 def test_from_arrays_tuples(idx):
     arrays = tuple(
         tuple(np.asarray(lev).take(level_codes))
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     )

     # tuple of tuples as input
@@ -368,7 +368,7 @@ def test_from_tuples_iterator():
         levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
     )

-    result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"])
+    result = MultiIndex.from_tuples(zip([1, 3], [2, 4], strict=True), names=["a", "b"])
     tm.assert_index_equal(result, expected)

     # input non-iterables

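Aside: a minimal sketch, on constructed data rather than the test fixtures, of the levels/codes round trip these tests exercise, with the newly required strict=True:

import numpy as np
from pandas import MultiIndex

idx = MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]], names=["x", "y"])

# Rebuild the original arrays from levels/codes; the two sequences are always
# the same length, so strict=True documents that invariant instead of
# silently truncating on a mismatch.
arrays = [
    np.asarray(lev).take(level_codes)
    for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
]
assert MultiIndex.from_arrays(arrays, names=["x", "y"]).equals(idx)
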
pandas/tests/indexes/multi/test_equivalence.py

Lines changed: 1 addition & 1 deletion

@@ -223,7 +223,7 @@ def test_equals_missing_values_differently_sorted():


 def test_is_():
-    mi = MultiIndex.from_tuples(zip(range(10), range(10)))
+    mi = MultiIndex.from_tuples(zip(range(10), range(10), strict=True))
     assert mi.is_(mi)
     assert mi.is_(mi.view())
     assert mi.is_(mi.view().view().view().view())

pandas/tests/indexes/multi/test_get_set.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ def assert_matching(actual, expected, check_dtype=False):
     # avoid specifying internal representation
     # as much as possible
     assert len(actual) == len(expected)
-    for act, exp in zip(actual, expected):
+    for act, exp in zip(actual, expected, strict=True):
         act = np.asarray(act)
         exp = np.asarray(exp)
         tm.assert_numpy_array_equal(act, exp, check_dtype=check_dtype)

pyproject.toml

Lines changed: 0 additions & 4 deletions

@@ -520,10 +520,6 @@ exclude = [
 "pandas/tests/indexes/interval/test_constructors.py" = ["B905"]
 "pandas/tests/indexes/interval/test_formats.py" = ["B905"]
 "pandas/tests/indexes/interval/test_interval.py" = ["B905"]
-"pandas/tests/indexes/multi/test_analytics.py" = ["B905"]
-"pandas/tests/indexes/multi/test_constructors.py" = ["B905"]
-"pandas/tests/indexes/multi/test_equivalence.py" = ["B905"]
-"pandas/tests/indexes/multi/test_get_set.py" = ["B905"]
 "pandas/tests/indexes/period/methods/test_asfreq.py" = ["B905"]
 "pandas/tests/indexes/period/test_constructors.py" = ["B905"]
 "pandas/tests/indexes/period/test_formats.py" = ["B905"]

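Aside: ruff's B905 rule flags zip() calls that omit an explicit strict= argument (available since Python 3.10). A minimal sketch of the behaviour the updated tests now opt into, which is why the per-file B905 exemptions above can be dropped:

# The default (strict=False) silently truncates to the shortest iterable.
print(list(zip([1, 2, 3], ["a", "b"])))  # [(1, 'a'), (2, 'b')]

# strict=True raises instead, surfacing length mismatches as errors.
try:
    list(zip([1, 2, 3], ["a", "b"], strict=True))
except ValueError as err:
    print(err)  # e.g. "zip() argument 2 is shorter than argument 1"
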
0 commit comments
