pandas-dev · oosei25 · Nov 2, 2025
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -2218,19 +2218,48 @@ def read_stata(
     ...         # Operate on a single chunk, e.g., chunk.mean()
     ...         pass  # doctest: +SKIP
     """
-    reader = StataReader(
-        filepath_or_buffer,
-        convert_dates=convert_dates,
-        convert_categoricals=convert_categoricals,
-        index_col=index_col,
-        convert_missing=convert_missing,
-        preserve_dtypes=preserve_dtypes,
-        columns=columns,
-        order_categoricals=order_categoricals,
-        chunksize=chunksize,
-        storage_options=storage_options,
-        compression=compression,
-    )
+    try:
+        reader = StataReader(
+            filepath_or_buffer,
+            convert_dates=convert_dates,
+            convert_categoricals=convert_categoricals,
+            index_col=index_col,
+            convert_missing=convert_missing,
+            preserve_dtypes=preserve_dtypes,
+            columns=columns,
+            order_categoricals=order_categoricals,
+            chunksize=chunksize,
+            storage_options=storage_options,
+            compression=compression,
+        )
+    except ValueError as e:
+        # If users pass HTML/JSON/etc. (e.g., a GitHub page URL), StataReader
+        # often raises a version/format ValueError. Replace with a clearer message.
+        msg = str(e)
+        if (
+            "Version of given Stata file is" in msg
+            or "not a Stata dataset" in msg
+            or "not a valid Stata" in msg
+        ):
+            base = (  # NB: keep "32,767" inside one literal (no stray space)
+                "This is not a valid Stata dataset. This may be because it is not a "
+                "valid Stata dataset, or a Stata dataset from a version of Stata that "
+                "pandas cannot import. pandas supports importing versions 105, 108, "
+                "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
+                "117 (Stata 13), 118 (Stata 14/15/16), and 119 (Stata 15/16, over "
+                "32,767 variables)."
+            )
+            hint = ""
+            if isinstance(filepath_or_buffer, (str, os.PathLike)):
+                s = os.fspath(filepath_or_buffer)
+                if "github.com" in s and ("/blob/" in s or "/tree/" in s):
+                    hint = (
+                        " If you're loading from GitHub, use the Raw file URL "
+                        "(replace '/blob/' with '/raw/' or click the 'Raw' button)."
+                    )
+            raise ValueError(base + hint) from e
+        # Different error: keep original
+        raise
 
     if iterator or chunksize:
         return reader

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -24,6 +24,7 @@
     Series,
 )
 
+from pandas.io import stata as stata_mod
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (
     CategoricalConversionWarning,
@@ -2620,3 +2621,26 @@ def test_ascii_error(temp_file, version):
     df.to_stata(temp_file, write_index=0, version=version)
     df_input = read_stata(temp_file)
     tm.assert_frame_equal(df, df_input)
+
+
+class _BoomReader:  # test stub swapped in for StataReader via monkeypatch
+    def __init__(self, *a, **k):  # accepts any arguments, then always fails
+        raise ValueError("Version of given Stata file is 10.")
+
+
+def test_non_stata_gives_clear_message(monkeypatch, tmp_path):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)  # force ValueError
+    with pytest.raises(ValueError, match=r"not a valid Stata dataset"):
+        read_stata(tmp_path / "not_stata.dta")  # this file is never created
+
+
+def test_github_blob_hint_is_appended(monkeypatch):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)  # force ValueError
+    with pytest.raises(ValueError, match=r"Raw file URL"):  # text from the hint
+        read_stata("https://github.com/user/repo/blob/main/file.dta")  # /blob/ URL
+
+
+def test_github_tree_hint_is_appended(monkeypatch):
+    monkeypatch.setattr(stata_mod, "StataReader", _BoomReader)  # force ValueError
+    with pytest.raises(ValueError, match=r"Raw file URL"):  # text from the hint
+        read_stata("https://github.com/user/repo/tree/main/data")  # /tree/ URL