Skip to content

Commit fda3768

Browse files
committed
fix format
1 parent 40c886c commit fda3768

File tree

2 files changed

+57
-6
lines changed

2 files changed

+57
-6
lines changed

rllm/engine/agent_execution_engine.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,17 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
203203
messages = agent.chat_completions
204204
prompt_tokens, _ = convert_messages_to_tokens_and_masks(messages, tokenizer=self.tokenizer, parser=self.chat_parser, contains_first_msg=True, contains_generation_msg=True)
205205
prompt_token_len = len(prompt_tokens)
206-
# Note, this should never happen!
206+
207+
# Check if initial prompt already exceeds max length
208+
# This can happen if:
209+
# 1. Dataset filtering didn't catch this sample (e.g., different tokenization)
210+
# 2. Checkpoint contains cached dataset that wasn't filtered (delete checkpoint's data.pt)
207211
if prompt_token_len > self.max_prompt_length:
208-
agent.reset()
209-
raise Exception(f"Trajectory {idx}: initial prompt length {prompt_token_len} already exceeded max_prompt_length {self.max_prompt_length}, retrying")
212+
logger.warning(f"Trajectory {idx}: Initial prompt length {prompt_token_len} exceeds max_prompt_length {self.max_prompt_length}. Skipping this sample entirely (no trajectory will be returned). First 200 chars of prompt: {self.chat_parser.parse(messages[:1], add_generation_prompt=False)[:200]}...")
213+
214+
# Close the environment and return None to skip this trajectory entirely
215+
await loop.run_in_executor(self.executor, env.close)
216+
return None
210217

211218
for step_idx in range(self.max_steps):
212219
# Get action from agent
@@ -410,7 +417,11 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
410417
async def run_agent_trajectory_with_retry(self, idx, application_id, seed=0, mode="Text", **kwargs):
411418
for _ in range(self.retry_limit):
412419
try:
413-
return await asyncio.wait_for(self.run_agent_trajectory_async(idx, application_id=application_id, seed=seed, mode=mode, **kwargs), timeout=7200)
420+
result = await asyncio.wait_for(self.run_agent_trajectory_async(idx, application_id=application_id, seed=seed, mode=mode, **kwargs), timeout=7200)
421+
# If result is None, it means the trajectory was skipped (e.g., overlong prompt)
422+
if result is None:
423+
return None
424+
return result
414425
except Exception:
415426
traceback.print_exc()
416427
continue
@@ -452,10 +463,18 @@ async def launch_one_trajectory_task(env_idx: int):
452463
tasks_to_run = [launch_one_trajectory_task(i) for i in range(len(self.envs))]
453464

454465
tasks_completed = 0
466+
skipped_count = 0
455467
for coro in asyncio.as_completed(tasks_to_run):
456468
try:
457469
result = await coro
458470
tasks_completed += 1
471+
472+
# Skip None results (trajectories that were skipped due to overlong prompts)
473+
if result is None:
474+
skipped_count += 1
475+
colorful_print(f"Number of Trajectories {tasks_completed}/{len(self.envs)} completed ({skipped_count} skipped due to overlong prompts)", "cyan")
476+
continue
477+
459478
colorful_print(f"Number of Trajectories {tasks_completed}/{len(self.envs)} completed", "cyan")
460479
yield result
461480
except Exception as e:

rllm/trainer/verl/agent_ppo_trainer.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,18 @@ def fit_agent(self):
180180
batch = self._pad_dataproto_to_world_size(batch=batch)
181181
else:
182182
final_gen_batch_output, generate_metrics = self.generate_agent_trajectory(timing_raw=timing_raw, meta_info=batch.meta_info)
183+
184+
# If some trajectories were skipped (overlong prompts), filter the batch to match
185+
if "skipped_indices" in final_gen_batch_output.meta_info:
186+
skipped_indices = final_gen_batch_output.meta_info.pop("skipped_indices")
187+
# Create mask for valid (non-skipped) indices
188+
valid_mask = np.ones(len(batch.batch), dtype=bool)
189+
valid_mask[skipped_indices] = False
190+
# Filter batch to only include valid samples
191+
valid_indices = np.where(valid_mask)[0]
192+
batch = batch.select_idxs(valid_indices)
193+
print(f"Filtered batch from {len(valid_mask)} to {len(valid_indices)} samples after skipping {len(skipped_indices)} overlong prompts")
194+
183195
batch = batch.union(final_gen_batch_output)
184196
metrics.update(generate_metrics)
185197

@@ -551,16 +563,36 @@ def generate_agent_trajectory(self, timing_raw=None, meta_info=None):
551563
trajectories = []
552564
if self.async_rollout_mode:
553565
gen_seq_generator = self.generate_agent_trajectories_async(timing_raw=timing_raw, meta_info=meta_info, mode="Token")
554-
for _, trajectory in enumerate(gen_seq_generator):
555-
trajectories.append(trajectory)
566+
for trajectory in gen_seq_generator:
567+
# Skip None trajectories (overlong prompts)
568+
if trajectory is not None:
569+
trajectories.append(trajectory)
556570
else:
557571
raise ValueError("Only async rollout mode is supported")
572+
573+
# Check if all trajectories were skipped
574+
if not trajectories:
575+
raise RuntimeError("All trajectories were skipped (likely all prompts exceed max_prompt_length). Please check your dataset and increase max_prompt_length or enable filtering.")
576+
558577
# Sort trajectories by their idx, to ensure they are in order.
559578
trajectories.sort(key=lambda x: x["idx"])
560579

580+
# Determine which indices were skipped by checking missing idx values
581+
# Expected indices are 0 to (batch_size * rollout.n - 1)
582+
expected_count = len(self.agent_execution_engine.envs)
583+
actual_indices = set(t["idx"] for t in trajectories)
584+
expected_indices = set(range(expected_count))
585+
skipped_indices = sorted(expected_indices - actual_indices)
586+
587+
if skipped_indices:
588+
print(f"Skipped {len(skipped_indices)} trajectories due to overlong prompts at env indices: {skipped_indices}")
589+
561590
with marked_timer("transform_trajectory", timing_raw):
562591
# Transform the raw trajectories into DataProto format.
563592
final_gen_batch_output, metrics = self._transform_agent_trajectories(trajectories)
593+
# Store skipped indices in meta_info for potential filtering of original batch
594+
if skipped_indices:
595+
final_gen_batch_output.meta_info["skipped_indices"] = skipped_indices
564596
return final_gen_batch_output, metrics
565597

566598
def generate_agent_steps(self, timing_raw=None, meta_info=None, uids=None):

0 commit comments

Comments (0)