Commit e8d2be1 (parent f974973)

merge AgentTrainer with PipelineAgentTrainer

6 files changed: +89 additions, −79 deletions

examples/eval_protocol/train_frozen_lake_flow.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -2,15 +2,15 @@
 
 from examples.eval_protocol.frozen_lake_flow import FrozenLakeWorkflow
 from rllm.data.dataset import DatasetRegistry
-from rllm.trainer.pipeline_agent_trainer import PipelineAgentTrainer
+from rllm.trainer.agent_trainer import AgentTrainer
 
 
 @hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
 def main(config):
     train_dataset = DatasetRegistry.load_dataset("frozen_lake_eval_protocol", "train")
     test_dataset = DatasetRegistry.load_dataset("frozen_lake_eval_protocol", "test")
 
-    trainer = PipelineAgentTrainer(
+    trainer = AgentTrainer(
         workflow_class=FrozenLakeWorkflow,
         workflow_args={
             "lite_llm_prefix": "fireworks_ai/",
@@ -21,6 +21,7 @@ def main(config):
         config=config,
         train_dataset=train_dataset,
         val_dataset=test_dataset,
+        backend="fireworks",
     )
     trainer.train()
```

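For any other script still on the old API, the same two-line migration applies: swap the import and pass the new `backend` argument. A minimal sketch of the pattern, with `MyWorkflow` as a hypothetical placeholder:

```python
# Before: from rllm.trainer.pipeline_agent_trainer import PipelineAgentTrainer
from rllm.trainer.agent_trainer import AgentTrainer

# MyWorkflow stands in for any workflow class (e.g., FrozenLakeWorkflow above)
trainer = AgentTrainer(
    workflow_class=MyWorkflow,
    config=config,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    backend="fireworks",  # selects the pipeline behavior formerly in PipelineAgentTrainer
)
trainer.train()
```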
examples/fireworks_math/README.md

Lines changed: 12 additions & 2 deletions

````diff
@@ -1,14 +1,24 @@
 ## Before Running Your Training Job
 
+First, install the Fireworks SDK and export your FIREWORKS_API_KEY:
+
+```bash
+pip install fireworks-ai
+```
+
+```bash
+export FIREWORKS_API_KEY=<YOUR_FIREWORKS_API_KEY>
+```
+
 Before starting your training, create a **Fireworks deployment**.
 
-I recommend installing **firectl** by following the guide here:
+We recommend installing **firectl** by following the guide here:
 [firectl Documentation](https://docs.fireworks.ai/tools-sdks/firectl/firectl)
 
 Then, create your deployment:
 
 ```bash
-firectl create deployment accounts/fireworks/models/qwen3-30b-a3b-instruct-2507 --enable-hot-reload-latest-addon --deployment-id <YOUR_CUSTOM_DEPLOYMENT_ID> --accelerator-type NVIDIA_H100_80GB
+firectl create deployment accounts/fireworks/models/qwen3-4b --enable-hot-reload-latest-addon --deployment-id <YOUR_CUSTOM_DEPLOYMENT_ID> --accelerator-type NVIDIA_H100_80GB
 ```
 
 ---
````

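A quick pre-flight check that the key is actually visible to the training process can save a failed run. A minimal sketch using only the standard library (not part of the commit):

```python
import os

# Fail fast if the key was not exported in this shell
if not os.environ.get("FIREWORKS_API_KEY"):
    raise SystemExit("FIREWORKS_API_KEY is not set; export it before training")
```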
examples/fireworks_math/train_fireworks_math.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -5,8 +5,11 @@
 from rllm.engine.rollout.rollout_engine import ModelOutput
 from rllm.rewards.reward_fn import math_reward_fn
 from rllm.rewards.reward_types import RewardOutput
-from rllm.trainer.pipeline_agent_trainer import PipelineAgentTrainer
-from rllm.workflows.single_turn_workflow import SingleTurnWorkflow
+from rllm.trainer.agent_trainer import AgentTrainer
+from rllm.workflows.simple_workflow import SimpleWorkflow
+
+# from rllm.agents.math_agent import MathAgent
+# from rllm.environments.base.single_turn_env import SingleTurnEnvironment
 
 
 def math_workflow_reward_fn(task_info: dict, action: str) -> RewardOutput:
@@ -22,8 +25,8 @@ def main(config):
     train_dataset = DatasetRegistry.load_dataset("hendrycks_math", "train")
     test_dataset = DatasetRegistry.load_dataset("math500", "test")
 
-    trainer = PipelineAgentTrainer(
-        workflow_class=SingleTurnWorkflow,
+    trainer = AgentTrainer(
+        workflow_class=SimpleWorkflow,
         workflow_args={
             "reward_function": math_workflow_reward_fn,
             "max_prompt_length": config.data.max_prompt_length,
@@ -32,6 +35,7 @@ def main(config):
         config=config,
         train_dataset=train_dataset,
         val_dataset=test_dataset,
+        backend="fireworks",
     )
     trainer.train()
```

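The body of `math_workflow_reward_fn` sits outside the diff hunks. A plausible minimal implementation, assuming `math_reward_fn` already accepts the same `(task_info, action)` pair, would simply delegate:

```python
from rllm.rewards.reward_fn import math_reward_fn
from rllm.rewards.reward_types import RewardOutput


def math_workflow_reward_fn(task_info: dict, action: str) -> RewardOutput:
    # Delegation sketch (assumption): reuse the stock math reward unchanged
    return math_reward_fn(task_info, action)
```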
examples/fireworks_math/train_fireworks_math.sh

Lines changed: 7 additions & 7 deletions

```diff
@@ -16,7 +16,7 @@ python3 -m examples.fireworks_math.train_fireworks_math \
     data.train_batch_size=8 \
     data.val_batch_size=512 \
     data.max_prompt_length=4096 \
-    data.max_response_length=16384 \
+    data.max_response_length=2048 \
     actor_rollout_ref.model.lora_rank=32 \
     actor_rollout_ref.model.lora_alpha=32 \
     actor_rollout_ref.rollout.load_format=safetensors \
@@ -54,7 +54,7 @@ python3 -m examples.fireworks_math.train_fireworks_math \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.actor.entropy_coeff=0 \
     algorithm.kl_ctrl.kl_coef=0.001 \
-    rllm.compact_filtering.enable=True \
+    rllm.compact_filtering.enable=False \
     rllm.compact_filtering.mask_max_prompt_length_exceeded=True \
     rllm.compact_filtering.mask_max_response_length_exceeded=True \
     rllm.compact_filtering.mask_max_turns_exceeded=False \
@@ -68,14 +68,14 @@ python3 -m examples.fireworks_math.train_fireworks_math \
     trainer.project_name='rllm-fireworks-workflow' \
     trainer.experiment_name='fireworks-hendrycks-math-4b' \
     trainer.max_actor_ckpt_to_keep=2 \
-    trainer.val_before_train=True \
-    trainer.n_gpus_per_node=8 \
-    +trainer.n_training_gpus_per_node=8 \
+    trainer.val_before_train=False \
+    trainer.n_gpus_per_node=2 \
+    +trainer.n_training_gpus_per_node=2 \
     trainer.nnodes=1 \
     trainer.save_freq=1 \
     trainer.test_freq=10 \
     trainer.default_hdfs_dir=null \
     trainer.total_epochs=100 \
     rllm.workflow.use_workflow=True \
-    fireworks.deployment_id=qwen3-4b-3 \
-    fireworks.model_id_prefix=test-math-qwen3-4b-3
+    fireworks.deployment_id=wtk15cs9 \
+    fireworks.model_id_prefix=qwen3-4b-math
```

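Per the `AgentTrainer` docstring in the next file, the same overrides can also be passed programmatically through the `config` argument instead of Hydra flags. A sketch mirroring a few of the values above:

```python
# Dot-notation overrides, as described in AgentTrainer's `config` docstring
config_overrides = {
    "data.train_batch_size": 8,
    "data.max_response_length": 2048,
    "rllm.compact_filtering.enable": False,
    "trainer.n_gpus_per_node": 2,
}

# Or the equivalent "key=value" list form
config_overrides_list = ["data.train_batch_size=8", "data.max_response_length=2048"]
```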
rllm/trainer/agent_trainer.py

Lines changed: 59 additions & 4 deletions

```diff
@@ -1,16 +1,22 @@
-from typing import Any
+from typing import Any, Literal
 
 import ray
 
 from rllm.data import Dataset
 from rllm.trainer.verl.ray_runtime_env import get_ppo_ray_runtime_env
 from rllm.trainer.verl.train_agent_ppo import TaskRunner
+from rllm.trainer.verl.train_workflow_pipeline import PipelineTaskRunner
+from verl.trainer.constants_ppo import get_ppo_ray_runtime_env as get_fireworks_ray_runtime_env
 
 
 class AgentTrainer:
     """
     A wrapper class that allows users to easily train custom agents with custom environments
     without having to directly interact with the underlying training infrastructure.
+
+    Supports two backends:
+    - 'verl' (default): Standard training backend supporting both workflow and agent/env classes
+    - 'fireworks': Pipeline-based training backend optimized for workflow-based training
     """
 
     def __init__(
@@ -24,23 +30,39 @@ def __init__(
         config: dict[str, Any] | list[str] | None = None,
         train_dataset: Dataset | None = None,
         val_dataset: Dataset | None = None,
+        backend: Literal["verl", "fireworks"] = "verl",
     ):
         """
         Initialize the AgentTrainer.
 
         Args:
+            workflow_class: The workflow class to use for training
+            workflow_args: Optional arguments to pass to the workflow class
             agent_class: The custom agent class to use for training
             env_class: The custom environment class to use for training
+            agent_args: Optional arguments to pass to the agent class
+            env_args: Optional arguments to pass to the environment class
             config: Configuration overrides to apply to the default config
                 Can be a dictionary with dot notation keys (e.g., {"data.train_batch_size": 8})
                 or a list of strings in the format "key=value" (e.g., ["data.train_batch_size=8"])
             train_dataset: Optional train dataset to use
            val_dataset: Optional validation dataset to use
-            agent_args: Optional arguments to pass to the agent class
-            env_args: Optional arguments to pass to the environment class
+            backend: Training backend to use ('verl' or 'fireworks'). Default is 'verl'
         """
+        # Validate backend
+        if backend not in ["verl", "fireworks"]:
+            raise ValueError(f"backend must be either 'verl' or 'fireworks', got '{backend}'")
+
+        self.backend = backend
+
+        # Validate backend-specific requirements
+        if backend == "fireworks":
+            if agent_class is not None or env_class is not None:
+                raise ValueError("The 'fireworks' backend only supports workflow_class. agent_class and env_class are not supported. Use workflow_args to configure agent and environment.")
+            if agent_args is not None or env_args is not None:
+                raise ValueError("The 'fireworks' backend does not support agent_args or env_args. Use workflow_args to configure the workflow.")
 
-        if workflow_class is not None and config.rllm.workflow.use_workflow:
+        if workflow_class is not None and config is not None and hasattr(config, "rllm") and hasattr(config.rllm, "workflow") and config.rllm.workflow.use_workflow:
             if agent_class is not None:
                 raise ValueError("agent_class is not supported when using workflow, instead use workflow_args['agent_cls']")
             if agent_args is not None:
@@ -66,6 +88,21 @@ def __init__(
             self.config.data.val_files = val_dataset.get_verl_data_path()
 
     def train(self):
+        """
+        Start the training process using the specified backend.
+        """
+        if self.backend == "verl":
+            self._train_with_verl()
+        elif self.backend == "fireworks":
+            self._train_with_fireworks()
+        else:
+            raise ValueError(f"Unknown backend: {self.backend}")
+
+    def _train_with_verl(self):
+        """
+        Train using the standard verl backend.
+        Supports both workflow-based and agent/env-based training.
+        """
         # Check if Ray is not initialized
         if not ray.is_initialized():
             # read off all the `ray_init` settings from the config
@@ -88,3 +125,21 @@ def train(self):
                 env_args=self.env_args,
             )
         )
+
+    def _train_with_fireworks(self):
+        """
+        Train using the fireworks (pipeline) backend.
+        Optimized for workflow-based training with the Fireworks API.
+        """
+        if not ray.is_initialized():
+            ray.init(runtime_env=get_fireworks_ray_runtime_env(), num_cpus=self.config.ray_init.num_cpus)
+
+        runner = PipelineTaskRunner.remote()
+
+        ray.get(
+            runner.run.remote(
+                config=self.config,
+                workflow_class=self.workflow_class,
+                workflow_args=self.workflow_args,
+            )
+        )
```

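Taken together, the merged class dispatches on `backend` at `train()` time. A usage sketch covering both paths (`MyWorkflow`, `MyAgent`, `MyEnv`, and `my_reward_fn` are hypothetical placeholders):

```python
from rllm.trainer.agent_trainer import AgentTrainer

# Workflow-based training on the Fireworks pipeline backend
trainer = AgentTrainer(
    workflow_class=MyWorkflow,  # hypothetical workflow class
    workflow_args={"reward_function": my_reward_fn},
    config=config,
    backend="fireworks",
)
trainer.train()  # dispatches to _train_with_fireworks()

# Agent/env-based training stays on the default verl backend
trainer = AgentTrainer(
    agent_class=MyAgent,  # hypothetical; rejected if backend="fireworks"
    env_class=MyEnv,      # hypothetical
    config=config,
)
trainer.train()  # dispatches to _train_with_verl()
```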
rllm/trainer/pipeline_agent_trainer.py

Lines changed: 0 additions & 60 deletions
This file was deleted.
