Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 42 additions & 13 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2218,19 +2218,48 @@ def read_stata(
... # Operate on a single chunk, e.g., chunk.mean()
... pass # doctest: +SKIP
"""
reader = StataReader(
filepath_or_buffer,
convert_dates=convert_dates,
convert_categoricals=convert_categoricals,
index_col=index_col,
convert_missing=convert_missing,
preserve_dtypes=preserve_dtypes,
columns=columns,
order_categoricals=order_categoricals,
chunksize=chunksize,
storage_options=storage_options,
compression=compression,
)
try:
reader = StataReader(
filepath_or_buffer,
convert_dates=convert_dates,
convert_categoricals=convert_categoricals,
index_col=index_col,
convert_missing=convert_missing,
preserve_dtypes=preserve_dtypes,
columns=columns,
order_categoricals=order_categoricals,
chunksize=chunksize,
storage_options=storage_options,
compression=compression,
)
except ValueError as e:
# If users pass HTML/JSON/etc. (e.g., a GitHub page URL), StataReader
# often raises a version/format ValueError. Replace with a clearer message.
msg = str(e)
if (
"Version of given Stata file is" in msg
or "not a Stata dataset" in msg
or "not a valid Stata" in msg
):
base = (
"This is not a valid Stata dataset. This may be because it is not a "
"valid Stata dataset, or a Stata dataset from a version of Stata that "
"pandas cannot import. pandas supports importing versions 105, 108, "
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
"117 (Stata 13), 118 (Stata 14/15/16), and 119 (Stata 15/16, over 32, "
"767 variables)."
)
hint = ""
if isinstance(filepath_or_buffer, (str, os.PathLike)):
s = os.fspath(filepath_or_buffer)
if "github.com" in s and ("/blob/" in s or "/tree/" in s):
hint = (
" If you're loading from GitHub, use the Raw file URL "
"(replace '/blob/' with '/raw/' or click the 'Raw' button)."
)
raise ValueError(base + hint) from e
# Different error: keep original
raise

if iterator or chunksize:
return reader
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Series,
)

from pandas.io import stata as stata_mod
from pandas.io.parsers import read_csv
from pandas.io.stata import (
CategoricalConversionWarning,
Expand Down Expand Up @@ -2620,3 +2621,26 @@ def test_ascii_error(temp_file, version):
df.to_stata(temp_file, write_index=0, version=version)
df_input = read_stata(temp_file)
tm.assert_frame_equal(df, df_input)


class _BoomReader:
    """Test double used in place of ``StataReader``; construction always fails.

    The constructor accepts and ignores any positional or keyword arguments
    and unconditionally raises ``ValueError`` with a fixed version message.
    """

    def __init__(self, *args, **kwargs):
        message = "Version of given Stata file is 10."
        raise ValueError(message)


def test_non_stata_gives_clear_message(monkeypatch, tmp_path):
    """``read_stata`` must raise a ``ValueError`` mentioning an invalid dataset.

    ``StataReader`` is replaced with ``_BoomReader``, whose constructor raises
    a version-related ``ValueError``. The path under ``tmp_path`` is never
    created by this test.
    """
    target = tmp_path / "not_stata.dta"
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)

    expected_pattern = r"not a valid Stata dataset"
    with pytest.raises(ValueError, match=expected_pattern):
        read_stata(target)


def test_github_blob_hint_is_appended(monkeypatch):
    """A GitHub ``/blob/`` URL must produce an error mentioning the Raw file URL.

    ``StataReader`` is replaced with ``_BoomReader`` so the reader constructor
    raises a version-related ``ValueError`` for the given URL string.
    """
    blob_url = "https://github.com/user/repo/blob/main/file.dta"
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)

    expected_pattern = r"Raw file URL"
    with pytest.raises(ValueError, match=expected_pattern):
        read_stata(blob_url)


def test_github_tree_hint_is_appended(monkeypatch):
    """A GitHub ``/tree/`` URL must produce an error mentioning the Raw file URL.

    ``StataReader`` is replaced with ``_BoomReader`` so the reader constructor
    raises a version-related ``ValueError`` for the given URL string.
    """
    tree_url = "https://github.com/user/repo/tree/main/data"
    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)

    expected_pattern = r"Raw file URL"
    with pytest.raises(ValueError, match=expected_pattern):
        read_stata(tree_url)
Loading