diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 1f953650365ef..0137321b0b435 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -2218,19 +2218,49 @@ def read_stata(
     ...     # Operate on a single chunk, e.g., chunk.mean()
     ...     pass  # doctest: +SKIP
     """
-    reader = StataReader(
-        filepath_or_buffer,
-        convert_dates=convert_dates,
-        convert_categoricals=convert_categoricals,
-        index_col=index_col,
-        convert_missing=convert_missing,
-        preserve_dtypes=preserve_dtypes,
-        columns=columns,
-        order_categoricals=order_categoricals,
-        chunksize=chunksize,
-        storage_options=storage_options,
-        compression=compression,
-    )
+    try:
+        reader = StataReader(
+            filepath_or_buffer,
+            convert_dates=convert_dates,
+            convert_categoricals=convert_categoricals,
+            index_col=index_col,
+            convert_missing=convert_missing,
+            preserve_dtypes=preserve_dtypes,
+            columns=columns,
+            order_categoricals=order_categoricals,
+            chunksize=chunksize,
+            storage_options=storage_options,
+            compression=compression,
+        )
+    except ValueError as e:
+        # If users pass HTML/JSON/etc. (e.g., a GitHub page URL), StataReader
+        # often raises a version/format ValueError. Replace with a clearer message.
+        msg = str(e)
+        if (
+            "Version of given Stata file is" in msg
+            or "not a Stata dataset" in msg
+            or "not a valid Stata" in msg
+        ):
+            base = (
+                "This is not a valid Stata dataset. This may be because it is not a "
+                "valid Stata dataset, or a Stata dataset from a version of Stata that "
+                "pandas cannot import. pandas supports importing versions 105, 108, "
+                "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
+                "117 (Stata 13), 118 (Stata 14/15/16), and 119 (Stata 15/16, over "
+                "32,767 variables)."
+            )
+            hint = ""
+            if isinstance(filepath_or_buffer, (str, os.PathLike)):
+                s = os.fspath(filepath_or_buffer)
+                if "github.com" in s and ("/blob/" in s or "/tree/" in s):
+                    hint = (
+                        " If you're loading from GitHub, use the Raw file URL "
+                        "(replace '/blob/' with '/raw/' or click the 'Raw' button)."
+                    )
+            # Chain the original error so the parser's exact complaint is kept.
+            raise ValueError(base + hint) from e
+        # Different error: keep original
+        raise
 
     if iterator or chunksize:
         return reader
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index b44f595e73670..1d2b3ca928266 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -24,6 +24,7 @@
     Series,
 )
 
+from pandas.io import stata as stata_mod
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (
     CategoricalConversionWarning,
@@ -2620,3 +2621,26 @@ def test_ascii_error(temp_file, version):
     df.to_stata(temp_file, write_index=0, version=version)
     df_input = read_stata(temp_file)
     tm.assert_frame_equal(df, df_input)
+
+
+class _BoomReader:
+    def __init__(self, *a, **k):
+        raise ValueError("Version of given Stata file is 10.")
+
+
+def test_non_stata_gives_clear_message(monkeypatch, tmp_path):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
+    with pytest.raises(ValueError, match=r"not a valid Stata dataset"):
+        read_stata(tmp_path / "not_stata.dta")
+
+
+def test_github_blob_hint_is_appended(monkeypatch):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
+    with pytest.raises(ValueError, match=r"Raw file URL"):
+        read_stata("https://github.com/user/repo/blob/main/file.dta")
+
+
+def test_github_tree_hint_is_appended(monkeypatch):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)
+    with pytest.raises(ValueError, match=r"Raw file URL"):
+        read_stata("https://github.com/user/repo/tree/main/data")