Commit ccf5621

Merge branch 'main' into enh-list-arith

2 parents: 35c85ab + 1863adb

8 files changed (+157, -92 lines)

ci/deps/actions-311.yaml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ dependencies:
 - psycopg2>=2.9.10
 - pyarrow>=13.0.0
 - pyiceberg>=0.8.1
+- pydantic<2.12.0 # TMP pin to avoid pyiceberg/pydantic issues
 - pymysql>=1.1.1
 - pyreadstat>=1.2.8
 - pytables>=3.10.1

ci/deps/actions-312.yaml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ dependencies:
 - psycopg2>=2.9.10
 - pyarrow>=13.0.0
 - pyiceberg>=0.8.1
+- pydantic<2.12.0 # TMP pin to avoid pyiceberg/pydantic issues
 - pymysql>=1.1.1
 - pyreadstat>=1.2.8
 - pytables>=3.10.1

pandas/io/stata.py

Lines changed: 148 additions & 81 deletions

@@ -44,7 +44,6 @@
     ValueLabelTypeMismatch,
 )
 from pandas.util._decorators import (
-    Appender,
     doc,
     set_module,
 )
@@ -127,10 +126,6 @@
     Return StataReader object for iterations, returns chunks with
     given number of lines."""

-_iterator_params = """\
-iterator : bool, default False
-    Return StataReader object."""
-
 _reader_notes = """\
 Notes
 -----
@@ -139,80 +134,6 @@
 file is associated to an incomplete set of value labels that only
 label a strict subset of the values."""

-_read_stata_doc = f"""
-Read Stata file into DataFrame.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
-    Any valid string path is acceptable. The string could be a URL. Valid
-    URL schemes include http, ftp, s3, and file. For file URLs, a host is
-    expected. A local file could be: ``file://localhost/path/to/table.dta``.
-
-    If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-    By file-like object, we refer to objects with a ``read()`` method,
-    such as a file handle (e.g. via builtin ``open`` function)
-    or ``StringIO``.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-{_chunksize_params}
-{_iterator_params}
-{_shared_docs["decompression_options"] % "filepath_or_buffer"}
-{_shared_docs["storage_options"]}
-
-Returns
--------
-DataFrame, pandas.api.typing.StataReader
-    If iterator or chunksize, returns StataReader, else DataFrame.
-
-See Also
---------
-io.stata.StataReader : Low-level reader for Stata data files.
-DataFrame.to_stata: Export Stata data files.
-
-{_reader_notes}
-
-Examples
---------
-
-Creating a dummy stata for this example
-
->>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
-...                     'speed': [350, 18, 361, 15]}})  # doctest: +SKIP
->>> df.to_stata('animals.dta')  # doctest: +SKIP
-
-Read a Stata dta file:
-
->>> df = pd.read_stata('animals.dta')  # doctest: +SKIP
-
-Read a Stata dta file in 10,000 line chunks:
-
->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")  # doctest: +SKIP
->>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
->>> df.to_stata('filename.dta')  # doctest: +SKIP
-
->>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
->>> for chunk in itr:
-...     # Operate on a single chunk, e.g., chunk.mean()
-...     pass  # doctest: +SKIP
-"""
-
-_read_method_doc = f"""\
-Reads observations from Stata file, converting them into a dataframe
-
-Parameters
-----------
-nrows : int
-    Number of lines to read from data file, if None read whole file.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-
-Returns
--------
-DataFrame
-"""
-
 _stata_reader_doc = f"""\
 Class for reading Stata dta files.

@@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
             size = self._chunksize
         return self.read(nrows=size)

-    @Appender(_read_method_doc)
     def read(
         self,
         nrows: int | None = None,
@@ -1689,6 +1609,38 @@ def read(
         columns: Sequence[str] | None = None,
         order_categoricals: bool | None = None,
     ) -> DataFrame:
+        """
+        Reads observations from Stata file, converting them into a dataframe
+
+        Parameters
+        ----------
+        nrows : int
+            Number of lines to read from data file, if None read whole file.
+        convert_dates : bool, default True
+            Convert date variables to DataFrame time values.
+        convert_categoricals : bool, default True
+            Read value labels and convert columns to Categorical/Factor variables.
+        index_col : str, optional
+            Column to set as index.
+        convert_missing : bool, default False
+            Flag indicating whether to convert missing values to their Stata
+            representations. If False, missing values are replaced with nan.
+            If True, columns containing missing values are returned with
+            object data types and missing values are represented by
+            StataMissingValue objects.
+        preserve_dtypes : bool, default True
+            Preserve Stata datatypes. If False, numeric data are upcast to pandas
+            default types for foreign data (float64 or int64).
+        columns : list or None
+            Columns to retain. Columns will be returned in the given order. None
+            returns all columns.
+        order_categoricals : bool, default True
+            Flag indicating whether converted categorical data are ordered.
+
+        Returns
+        -------
+        DataFrame
+        """
         self._ensure_open()

         # Handle options
@@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:


 @set_module("pandas")
-@Appender(_read_stata_doc)
 def read_stata(
     filepath_or_buffer: FilePath | ReadBuffer[bytes],
     *,
@@ -2151,6 +2102,122 @@ def read_stata(
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions | None = None,
 ) -> DataFrame | StataReader:
+    """
+    Read Stata file into DataFrame.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be: ``file://localhost/path/to/table.dta``.
+
+        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method,
+        such as a file handle (e.g. via builtin ``open`` function)
+        or ``StringIO``.
+    convert_dates : bool, default True
+        Convert date variables to DataFrame time values.
+    convert_categoricals : bool, default True
+        Read value labels and convert columns to Categorical/Factor variables.
+    index_col : str, optional
+        Column to set as index.
+    convert_missing : bool, default False
+        Flag indicating whether to convert missing values to their Stata
+        representations. If False, missing values are replaced with nan.
+        If True, columns containing missing values are returned with
+        object data types and missing values are represented by
+        StataMissingValue objects.
+    preserve_dtypes : bool, default True
+        Preserve Stata datatypes. If False, numeric data are upcast to pandas
+        default types for foreign data (float64 or int64).
+    columns : list or None
+        Columns to retain. Columns will be returned in the given order. None
+        returns all columns.
+    order_categoricals : bool, default True
+        Flag indicating whether converted categorical data are ordered.
+    chunksize : int, default None
+        Return StataReader object for iterations, returns chunks with
+        given number of lines.
+    iterator : bool, default False
+        Return StataReader object.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one
+        data file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    Returns
+    -------
+    DataFrame, pandas.api.typing.StataReader
+        If iterator or chunksize, returns StataReader, else DataFrame.
+
+    See Also
+    --------
+    io.stata.StataReader : Low-level reader for Stata data files.
+    DataFrame.to_stata: Export Stata data files.
+
+    Notes
+    -----
+    Categorical variables read through an iterator may not have the same
+    categories and dtype. This occurs when a variable stored in a DTA
+    file is associated to an incomplete set of value labels that only
+    label a strict subset of the values.
+
+    Examples
+    --------
+
+    Creating a dummy stata for this example
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+    ...         "speed": [350, 18, 361, 15],
+    ...     }
+    ... )  # doctest: +SKIP
+    >>> df.to_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file:
+
+    >>> df = pd.read_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file in 10,000 line chunks:
+
+    >>> values = np.random.randint(
+    ...     0, 10, size=(20_000, 1), dtype="uint8"
+    ... )  # doctest: +SKIP
+    >>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
+    >>> df.to_stata("filename.dta")  # doctest: +SKIP
+
+    >>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
+    >>> for chunk in itr:
+    ...     # Operate on a single chunk, e.g., chunk.mean()
+    ...     pass  # doctest: +SKIP
+    """
     reader = StataReader(
         filepath_or_buffer,
         convert_dates=convert_dates,

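Aside: the relocated read_stata docstring above describes chunked reading through a StataReader. A minimal runnable sketch of that pattern, with an illustrative file name that is not part of this commit:

import numpy as np
import pandas as pd

# Write a small throwaway .dta file; "chunks_demo.dta" is a hypothetical name.
values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")
pd.DataFrame(values, columns=["i"]).to_stata("chunks_demo.dta")

# With chunksize, read_stata returns a StataReader usable as a context
# manager; each iteration yields a DataFrame of at most 10,000 rows.
with pd.read_stata("chunks_demo.dta", chunksize=10_000) as itr:
    for chunk in itr:
        print(chunk["i"].mean())
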
pandas/tests/indexes/multi/test_analytics.py

Lines changed: 1 addition & 1 deletion

@@ -185,7 +185,7 @@ def test_map(idx):
 @pytest.mark.parametrize(
     "mapper",
     [
-        lambda values, idx: {i: e for e, i in zip(values, idx)},
+        lambda values, idx: {i: e for e, i in zip(values, idx, strict=True)},
         lambda values, idx: pd.Series(values, idx),
     ],
 )

pandas/tests/indexes/multi/test_constructors.py

Lines changed: 4 additions & 4 deletions

@@ -155,7 +155,7 @@ def test_copy_in_constructor():
 def test_from_arrays(idx):
     arrays = [
         np.asarray(lev).take(level_codes)
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     ]

     # list of arrays as input
@@ -172,7 +172,7 @@ def test_from_arrays_iterator(idx):
     # GH 18434
     arrays = [
         np.asarray(lev).take(level_codes)
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     ]

     # iterator as input
@@ -188,7 +188,7 @@ def test_from_arrays_iterator(idx):
 def test_from_arrays_tuples(idx):
     arrays = tuple(
         tuple(np.asarray(lev).take(level_codes))
-        for lev, level_codes in zip(idx.levels, idx.codes)
+        for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
     )

     # tuple of tuples as input
@@ -368,7 +368,7 @@ def test_from_tuples_iterator():
         levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
     )

-    result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"])
+    result = MultiIndex.from_tuples(zip([1, 3], [2, 4], strict=True), names=["a", "b"])
     tm.assert_index_equal(result, expected)

     # input non-iterables

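Aside: a minimal sketch, on constructed data rather than the test fixtures, of the levels/codes round trip these tests exercise, with the newly required strict=True:

import numpy as np
from pandas import MultiIndex

idx = MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]], names=["x", "y"])

# Rebuild the original arrays from levels/codes; the two sequences are always
# the same length, so strict=True documents that invariant instead of
# silently truncating on a mismatch.
arrays = [
    np.asarray(lev).take(level_codes)
    for lev, level_codes in zip(idx.levels, idx.codes, strict=True)
]
assert MultiIndex.from_arrays(arrays, names=["x", "y"]).equals(idx)
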
pandas/tests/indexes/multi/test_equivalence.py

Lines changed: 1 addition & 1 deletion

@@ -223,7 +223,7 @@ def test_equals_missing_values_differently_sorted():


 def test_is_():
-    mi = MultiIndex.from_tuples(zip(range(10), range(10)))
+    mi = MultiIndex.from_tuples(zip(range(10), range(10), strict=True))
     assert mi.is_(mi)
     assert mi.is_(mi.view())
     assert mi.is_(mi.view().view().view().view())

pandas/tests/indexes/multi/test_get_set.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ def assert_matching(actual, expected, check_dtype=False):
     # avoid specifying internal representation
     # as much as possible
     assert len(actual) == len(expected)
-    for act, exp in zip(actual, expected):
+    for act, exp in zip(actual, expected, strict=True):
         act = np.asarray(act)
         exp = np.asarray(exp)
         tm.assert_numpy_array_equal(act, exp, check_dtype=check_dtype)

pyproject.toml

Lines changed: 0 additions & 4 deletions

@@ -520,10 +520,6 @@ exclude = [
 "pandas/tests/indexes/interval/test_constructors.py" = ["B905"]
 "pandas/tests/indexes/interval/test_formats.py" = ["B905"]
 "pandas/tests/indexes/interval/test_interval.py" = ["B905"]
-"pandas/tests/indexes/multi/test_analytics.py" = ["B905"]
-"pandas/tests/indexes/multi/test_constructors.py" = ["B905"]
-"pandas/tests/indexes/multi/test_equivalence.py" = ["B905"]
-"pandas/tests/indexes/multi/test_get_set.py" = ["B905"]
 "pandas/tests/indexes/period/methods/test_asfreq.py" = ["B905"]
 "pandas/tests/indexes/period/test_constructors.py" = ["B905"]
 "pandas/tests/indexes/period/test_formats.py" = ["B905"]

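Aside: ruff's B905 rule flags zip() calls that omit an explicit strict= argument (available since Python 3.10). A minimal sketch of the behaviour the updated tests now opt into, which is why the per-file B905 exemptions above can be dropped:

# The default (strict=False) silently truncates to the shortest iterable.
print(list(zip([1, 2, 3], ["a", "b"])))  # [(1, 'a'), (2, 'b')]

# strict=True raises instead, surfacing length mismatches as errors.
try:
    list(zip([1, 2, 3], ["a", "b"], strict=True))
except ValueError as err:
    print(err)  # e.g. "zip() argument 2 is shorter than argument 1"
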
0 commit comments
