BUG: Remove incorrect check on value label length (#60156)

bashtage · web-flow · commit e26e3eec231c · 2024-10-31T08:56:55.000-07:00
* BUG: Remove incorrect check on value label length Remove 32,000 limit on value limit check since this applies to the number of variable, not the length of the value labels closes #60107 * TST: Remove incorrect test Remove test of the error that was being incorrectly raised
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -687,6 +687,7 @@ I/O
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
+- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -691,12 +691,6 @@ def _prepare_value_labels(self) -> None:
             self.txt.append(category)
             self.n += 1
 
-        if self.text_len > 32000:
-            raise ValueError(
-                "Stata value labels for a single variable must "
-                "have a combined length less than 32,000 characters."
-            )
-
         # Ensure int32
         self.off = np.array(offsets, dtype=np.int32)
         self.val = np.array(values, dtype=np.int32)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -3,7 +3,9 @@
 from datetime import datetime
 import gzip
 import io
+import itertools
 import os
+import string
 import struct
 import tarfile
 import zipfile
@@ -1163,28 +1165,13 @@ def test_categorical_writing(self, version, temp_file):
 
     def test_categorical_warnings_and_errors(self, temp_file):
         # Warning for non-string labels
-        # Error for labels too long
-        original = DataFrame.from_records(
-            [["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]],
-            columns=["Too_long"],
-        )
-
-        original = original.astype("category")
-        path = temp_file
-        msg = (
-            "Stata value labels for a single variable must have "
-            r"a combined length less than 32,000 characters\."
-        )
-        with pytest.raises(ValueError, match=msg):
-            original.to_stata(path)
-
         original = DataFrame.from_records(
             [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
         ).astype("category")
 
         msg = "data file created has not lost information due to duplicate labels"
         with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg):
-            original.to_stata(path)
+            original.to_stata(temp_file)
             # should get a warning for mixed content
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -2592,3 +2579,12 @@ def test_empty_frame(temp_file):
     df3 = read_stata(path, columns=["a"])
     assert "b" not in df3
     tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_many_strl(temp_file, version):
+    n = 65534
+    df = DataFrame(np.arange(n), columns=["col"])
+    lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))]
+    value_labels = {"col": {i: lbls[i] for i in range(n)}}
+    df.to_stata(temp_file, value_labels=value_labels, version=version)