fix: Gracefully skip overlong prompts during training to prevent crashes (#281)

erranlli · web-flow · commit 2956f869b472 · 2025-10-31T16:44:39.000-07:00
diff --git a/rllm/data/dataset.py b/rllm/data/dataset.py
@@ -3,6 +3,7 @@
 import os
 from typing import Any
 
+import numpy as np
 import pandas as pd
 import polars as pl
 import torch
@@ -373,6 +374,27 @@ def remove_dataset(cls, name: str) -> bool:
         logger.info(f"Removed dataset '{name}' from registry.")
         return True
 
+    @staticmethod
+    def _convert_to_json_serializable(obj: Any) -> Any:
+        """Recursively replace numpy values in ``obj`` with plain Python equivalents.
+
+        Args:
+            obj: Value to convert; dicts, lists and tuples are walked recursively.
+
+        Returns:
+            ``obj`` with arrays as lists, numpy scalars (values and dict keys) as Python scalars, tuples as lists.
+        """
+        convert = DatasetRegistry._convert_to_json_serializable
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        if isinstance(obj, np.generic):  # every numpy scalar, incl. np.bool_ (which is not an np.integer)
+            return obj.item()
+        if isinstance(obj, dict):
+            return {(k.item() if isinstance(k, np.generic) else k): convert(v) for k, v in obj.items()}
+        if isinstance(obj, list | tuple):
+            return [convert(item) for item in obj]
+        return obj
+
     @classmethod
     def apply_verl_postprocessing(cls, data: list[dict[str, Any]]) -> list[dict[str, Any]]:
         """Apply Verl postprocessing to the dataset.
@@ -382,16 +404,27 @@ def apply_verl_postprocessing(cls, data: list[dict[str, Any]]) -> list[dict[str,
 
         Returns:
             List of dictionaries with Verl-compatible format
+
+        Note:
+            All nested structures (lists, dicts) are JSON-serialized to avoid
+            PyArrow "Nested data conversions not implemented for chunked array outputs"
+            error when loading from Parquet in distributed contexts.
         """
         processed_data = []
         for entry in data:
+            # Convert numpy arrays to lists before JSON serialization
+            serializable_entry = cls._convert_to_json_serializable(entry)
+
             processed_entry = {
-                "prompt": [{"role": "user", "content": "placeholder"}],
-                "reward_model": {
-                    "style": "rule",
-                    "ground_truth": None,
-                },
-                "extra_info": entry,
+                # Serialize nested structures as JSON strings to avoid PyArrow chunked array issues
+                "prompt": json.dumps([{"role": "user", "content": "placeholder"}]),
+                "reward_model": json.dumps(
+                    {
+                        "style": "rule",
+                        "ground_truth": None,
+                    }
+                ),
+                "extra_info": json.dumps(serializable_entry),
             }
             processed_data.append(processed_entry)
         return processed_data