Commit 0360a25

Merge pull request #272 from thwu1/nightly
Fix retokenization
2 parents cd97c06 + 3306c3c

3 files changed: +78 -6 lines

rllm/engine/agent_execution_engine.py

Lines changed: 76 additions & 6 deletions
@@ -144,12 +144,12 @@ async def get_model_response(self, prompt, application_id, **kwargs) -> str:
 
         if self.engine_name == "openai":
             output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, enforce_max_prompt_length=False, **sampling_params)
-            return output.text
+            return output
         elif self.engine_name == "verl":
             meta_data = sampling_params.pop("meta_info", {})
             validate = meta_data.get("validate", False)
             output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, validate=validate, enforce_max_prompt_length=False, **sampling_params)
-            return output.text
+            return output
         else:
             raise NotImplementedError(f"Engine type '{self.engine_name}' not supported")
 
@@ -232,14 +232,17 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
             kwargs["max_tokens"] = max_tokens
 
             start_time = time.time()
-            response = await self.get_model_response(prompt_messages, application_id, **kwargs)
+            model_output = await self.get_model_response(prompt_messages, application_id, **kwargs)
+            response = model_output.text
             delta_time = time.time() - start_time
             llm_time += delta_time
             total_time += delta_time
             # Update steps
             prompt_response_pair = {
                 "prompt": self.chat_parser.parse(prompt_messages, add_generation_prompt=True, is_first_msg=True),
                 "response": response,
+                "prompt_ids": model_output.prompt_ids,
+                "completion_ids": model_output.completion_ids,
             }
             episode_steps.append(prompt_response_pair)
 
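The two hunks above stop returning `output.text` and instead pass the engine's full response object through, so the exact token ids the rollout engine consumed and produced are recorded alongside the decoded text. A minimal stand-in for that response object, assuming only the three fields the diff actually reads (`text`, `prompt_ids`, `completion_ids`); the class name and defaults are illustrative, not the repository's API:

from dataclasses import dataclass, field

@dataclass
class ModelOutputStub:
    """Hypothetical container mirroring what run_agent_trajectory_async reads from model_output."""
    text: str                                                # decoded completion, used for parsing/logging
    prompt_ids: list[int] = field(default_factory=list)      # token ids the engine actually consumed
    completion_ids: list[int] = field(default_factory=list)  # token ids the engine actually generated
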
@@ -379,10 +382,11 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
         if mode == "Text":
             return trajectory
         elif mode == "Token":
+            prompt_tokens, response_tokens, response_masks, is_valid_trajectory = self.assemble_steps(episode_steps)
             token_result = {
-                "prompt_tokens": torch.tensor(prompt_tokens, dtype=torch.long),
-                "response_tokens": torch.tensor(response_tokens, dtype=torch.long),
-                "response_masks": torch.tensor(response_masks, dtype=torch.long),
+                "prompt_tokens": prompt_tokens,
+                "response_tokens": response_tokens,
+                "response_masks": response_masks,
                 "trajectory_reward": trajectory.reward,
                 "idx": env.idx,
                 "chat_completions": agent.chat_completions,
@@ -397,6 +401,7 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
                     "llm_time": llm_time,
                     # Total time spent in the trajectory
                     "total_time": total_time,
+                    "token_mismatch": 0.0 if is_valid_trajectory else 1.0,
                 },
             }
             return token_result
@@ -410,6 +415,71 @@ async def run_agent_trajectory_async(self, idx, application_id, seed=0, mode="Te
                 "mc_returns": [step.mc_return for step in trajectory.steps][: len(episode_steps)],
             }
             return steps_result
+        else:
+            raise ValueError(f"Mode {mode} not supported")
+
+    def assemble_steps(self, steps: list[dict]):
+        """
+        Transform step-by-step results into trajectory format for training.
+        The assembly is aggressive: if the steps are not cumulative, response_masks is set to all 0s.
+
+        Each step in ``steps`` is a dict of the form:
+            {"prompt": str, "response": str, "prompt_ids": list, "completion_ids": list}
+
+        For training, we need to assemble the full conversation sequence where:
+        - prompt_tokens: initial prompt (first step's prompt_ids)
+        - response_tokens: all subsequent conversation (completion_ids + each next step's new prompt_ids)
+        - response_masks: mask indicating which tokens contribute to loss (only completion_ids)
+        """
+
+        # Start with the initial prompt from the first step
+        initial_prompt_ids = steps[0]["prompt_ids"]
+        accumulated_sequence = initial_prompt_ids.copy()
+        response_tokens = []
+        response_masks = []
+        is_valid_trajectory = True
+
+        for i, step in enumerate(steps):
+            current_prompt_ids = step["prompt_ids"]
+            current_completion_ids = step["completion_ids"]
+
+            if i == 0:
+                # First step: just add the completion
+                response_tokens.extend(current_completion_ids)
+                response_masks.extend([1] * len(current_completion_ids))  # completion contributes to loss
+                accumulated_sequence.extend(current_completion_ids)
+            else:
+                if current_prompt_ids[: len(accumulated_sequence)] != accumulated_sequence:
+                    # Find the first differing position
+                    prefix = current_prompt_ids[: len(accumulated_sequence)]
+                    diff_pos = None
+                    for pos, (expected, actual) in enumerate(zip(accumulated_sequence, prefix, strict=False)):
+                        if expected != actual:
+                            diff_pos = pos
+                            break
+
+                    if diff_pos is not None:
+                        logger.warning(f"While assembling steps, the trajectory is not cumulative at position {diff_pos}. Expected: {accumulated_sequence[diff_pos : diff_pos + 5]}, Got: {prefix[diff_pos : diff_pos + 5]}. Setting response_masks to all 0s. This is likely due to retokenization.")
+                    else:
+                        logger.warning(f"While assembling steps, detected a length mismatch. Expected length: {len(accumulated_sequence)}, Got length: {len(prefix)}. Setting response_masks to all 0s.")
+
+                    is_valid_trajectory = False
+                    break
+
+                response_tokens.extend(current_prompt_ids[len(accumulated_sequence) :] + current_completion_ids)
+                response_masks.extend([0] * (len(current_prompt_ids) - len(accumulated_sequence)) + [1] * len(current_completion_ids))  # only the completion contributes to loss
+                accumulated_sequence = current_prompt_ids + current_completion_ids
+
+        assert len(response_masks) == len(response_tokens)
+
+        prompt_tokens = torch.tensor(initial_prompt_ids, dtype=torch.long)
+        response_tokens = torch.tensor(response_tokens, dtype=torch.long)
+        response_masks = torch.tensor(response_masks, dtype=torch.long)
+
+        if self.config.rllm.filter_token_mismatch:
+            response_masks = response_masks * int(is_valid_trajectory)
+
+        return prompt_tokens, response_tokens, response_masks, is_valid_trajectory
 
     async def run_agent_trajectory_with_retry(self, idx, application_id, seed=0, mode="Text", **kwargs):
         for _ in range(self.retry_limit):
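The retokenization fix hinges on each step's prompt_ids extending the previously accumulated token sequence, so a compact sketch can make the assembly behaviour concrete. This is a toy re-implementation with fabricated token ids, not the repository's code; it assumes only the step-dict shape shown in the diff above:

import torch


def assemble_steps_sketch(steps, filter_token_mismatch=True):
    """Toy version of the assembly above: stitch per-step token ids into one trajectory."""
    initial_prompt_ids = steps[0]["prompt_ids"]
    accumulated = list(initial_prompt_ids)
    response_tokens, response_masks = [], []
    is_valid = True

    for i, step in enumerate(steps):
        prompt_ids, completion_ids = step["prompt_ids"], step["completion_ids"]
        if i == 0:
            # First step: only the completion is appended, and it is trained on.
            response_tokens += completion_ids
            response_masks += [1] * len(completion_ids)
            accumulated += completion_ids
            continue
        if prompt_ids[: len(accumulated)] != accumulated:
            # Retokenization changed earlier tokens: the trajectory is no longer cumulative.
            is_valid = False
            break
        new_prompt_part = prompt_ids[len(accumulated):]
        response_tokens += new_prompt_part + completion_ids
        response_masks += [0] * len(new_prompt_part) + [1] * len(completion_ids)
        accumulated = prompt_ids + completion_ids

    masks = torch.tensor(response_masks, dtype=torch.long)
    if filter_token_mismatch:
        masks = masks * int(is_valid)  # zero the whole trajectory's loss mask on mismatch
    return torch.tensor(initial_prompt_ids, dtype=torch.long), torch.tensor(response_tokens, dtype=torch.long), masks, is_valid


# Two fabricated steps whose second prompt extends the first prompt + completion exactly.
steps = [
    {"prompt_ids": [1, 2, 3], "completion_ids": [4, 5]},
    {"prompt_ids": [1, 2, 3, 4, 5, 6], "completion_ids": [7]},
]
prompt, response, masks, ok = assemble_steps_sketch(steps)
print(response.tolist(), masks.tolist(), ok)  # [4, 5, 6, 7] [1, 1, 0, 1] True

In the actual method the mismatch case additionally emits a warning and is reported through the token_mismatch metric shown in the hunks above.
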

rllm/trainer/config/agent_ppo_trainer.yaml

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ rllm:
   disable_thinking: False
   accumulate_reasoning: False
   mask_truncated_samples: False
+  filter_token_mismatch: True
   stepwise_advantage:
     enable: False
     mode: broadcast # [broadcast, per_step]
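The new filter_token_mismatch flag (defaulting to True above) gates the mask-zeroing inside assemble_steps: a trajectory whose retokenized prompt diverges from the accumulated sequence stays in the batch but contributes no loss. A small, self-contained illustration of that gating, assuming an OmegaConf-style config object as is typical for Hydra-based trainers (not a verbatim excerpt from the trainer):

import torch
from omegaconf import OmegaConf

# Fabricated config and masks, just to show the effect of the flag.
cfg = OmegaConf.create({"rllm": {"filter_token_mismatch": True}})
response_masks = torch.tensor([1, 1, 0, 1], dtype=torch.long)
is_valid_trajectory = False  # e.g. a retokenization mismatch was detected

if cfg.rllm.filter_token_mismatch:
    response_masks = response_masks * int(is_valid_trajectory)

print(response_masks.tolist())  # [0, 0, 0, 0] -> this trajectory is excluded from the loss
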

rllm/trainer/verl/agent_ppo_trainer_pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -139,6 +139,7 @@ def create_replay_queue(generator, q, batch_iter_val, timing_raw_val):
             # Get the generator function which will yield results as they complete
             if self.config.rllm.agent.step_advantage_broadcast:
                 raise Exception("Stepwise advantage broadcasting not supported on pipelined trainer yet")
+
             gen_seq_generator = self.generate_agent_trajectories_async(timing_raw=timing_raw, meta_info=batch.meta_info)
             thread = threading.Thread(target=create_replay_queue, args=(gen_seq_generator, replay_queue, batch_iter, timing_raw))
             thread.start()