
Commit 963b417

Merge branch 'proxy' into nightly
2 parents 976d74c + 639247b


50 files changed: +13577 −4 lines

examples/omni_trainer/README.md

Lines changed: 100 additions & 0 deletions
# Omni Trainer

This example demonstrates how to use the Omni Trainer for reinforcement learning with language models.

## Prerequisites

### 1. Install Verl

Run the installation script:
```bash
bash scripts/install_verl.sh
```

**Important:** Install `torch==2.6.0` when installing Verl. After `install_verl.sh` finishes, install `vllm==0.10.0`; this bumps torch to 2.7.1, which is expected.

**Troubleshooting:**
- If you encounter issues with `flash_attn`, reinstall it with:
```bash
pip install flash-attn --no-build-isolation
```

- If you encounter errors with Ray, try:
```bash
pip install ray==2.48.0
```

### 2. Install Episodic

Download and install [episodic](https://github.com/agentica-org/episodic):
```bash
cd rllm/sdk/episodic-sdk
pip install -e .
```
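To confirm the install, check that the CLI entry point is on your `PATH` (this assumes the package exposes the `episodic` command used in the Setup section below, with the usual `--help` flag):
```bash
episodic --help
```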
### 3. Verify Dependencies

Check that your websocket version is >= 15.0 (version 13.x will not work).
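One way to check (assuming the dependency in question is the `websockets` Python package):
```bash
pip show websockets                  # look for Version: >= 15.0
pip install -U "websockets>=15.0"    # upgrade if needed
```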
## Setup

### 1. Launch the Context Store

Start the episodic context store server:

```bash
episodic serve --db-path /tmp/episodic.db  # choose a local path for better performance
```

### 2. Deploy the LiteLLM Proxy

In a separate terminal, start the LiteLLM proxy server:

```bash
#!/bin/bash

# Set ulimit first
ulimit -n 65536

# Set aiohttp connection limits
export AIOHTTP_CONNECTOR_LIMIT=4096
export AIOHTTP_KEEPALIVE_TIMEOUT=60

# Verify the limits are set
echo "Current ulimit -n: $(ulimit -n)"
echo "AIOHTTP_CONNECTOR_LIMIT: $AIOHTTP_CONNECTOR_LIMIT"
echo "AIOHTTP_KEEPALIVE_TIMEOUT: $AIOHTTP_KEEPALIVE_TIMEOUT"
echo "Starting LiteLLM proxy..."

# Start the proxy
python scripts/litellm_proxy_server.py \
    --config litellm_proxy_config_autogen.yaml \
    --host 127.0.0.1 \
    --port 4000 \
    --state-dir /tmp/litellm_proxy \
    --cs-endpoint http://localhost:8000 \
    --cs-api-key "your-api-key-here" \
    --project rllm-agent-omni-engine \
    --admin-token my-shared-secret
```
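Once the proxy is up, a quick sanity check that it is answering (this assumes the wrapper exposes the standard OpenAI-compatible routes that LiteLLM serves):
```bash
# List the models the proxy is serving via the OpenAI-compatible API.
curl -s http://127.0.0.1:4000/v1/models
```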
## Running the Examples

Once both the context store and LiteLLM proxy are running, you can execute one of the training examples.

### Hendrycks Math Training

This is the simplest example: a single agent and a single turn.

```bash
./train_hendrycks_math.sh
```

### Solver-Judge Flow Training

This is a more involved example, with two agents and more complex grouping logic.

```bash
./train_solver_judge_flow.sh
```
Lines changed: 47 additions & 0 deletions
import hydra

from rllm.data.dataset import DatasetRegistry
from rllm.rewards.reward_fn import math_reward_fn
from rllm.sdk.shortcuts import get_chat_client
from rllm.trainer.agent_trainer import AgentTrainer
from rllm.workflows.simple_workflow import SimpleWorkflow


@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
def main(config):
    train_dataset = DatasetRegistry.load_dataset("hendrycks_math", "train")
    test_dataset = DatasetRegistry.load_dataset("math500", "test")

    # Define run function that recreates the client inside to avoid closure capture
    # This ensures the function is fully serializable for Ray
    def run(
        question: str,
        ground_truth: str,
        base_url: str = "http://localhost:4000/v1",
        api_key: str = "EMPTY",
        **kwargs,
    ):
        # Recreate the client inside the function to avoid serialization issues
        # This ensures the function doesn't capture non-serializable objects
        client = get_chat_client(base_url=base_url, api_key=api_key)
        response = client.chat.completions.create(
            model="vllm/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
            messages=[
                {"role": "user", "content": question},
            ],
        )
        response_text = response.choices[0].message.content
        reward = math_reward_fn({"response": response_text, "ground_truth": ground_truth}, response_text).reward
        return reward

    trainer = AgentTrainer(
        config=config,
        train_dataset=train_dataset,
        val_dataset=test_dataset,
        agent_run_func=run,
    )
    trainer.train()


if __name__ == "__main__":
    main()
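As a hedged aside, the `run` function above can be exercised on its own to confirm that the proxy and reward path work end to end. It is defined inside `main()`, so for a quick check you would paste it (with the same imports) into a scratch script; the values below are illustrative only:

```python
# Hypothetical smoke test, assuming the LiteLLM proxy from the README is
# serving on http://localhost:4000/v1 with the model name used above.
reward = run(
    question="What is 2 + 2?",
    ground_truth="4",
    base_url="http://localhost:4000/v1",
    api_key="EMPTY",
)
print("reward:", reward)  # math_reward_fn scores the model's answer against ground_truth
```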
Lines changed: 73 additions & 0 deletions
set -x

export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:False"
export VLLM_USE_V1=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export VLLM_ENGINE_ITERATION_TIMEOUT_S=100000000000

# Find the directory where rllm package is located
RLLM_DIR=$(python3 -c "import rllm; import os; print(os.path.dirname(os.path.dirname(rllm.__file__)))")

MODEL_PATH=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B

python3 -m examples.omni_trainer.simple_math.train_hendrycks_math \
    algorithm.adv_estimator=grpo \
    data.train_batch_size=32 \
    data.val_batch_size=512 \
    data.max_prompt_length=2048 \
    data.max_response_length=2048 \
    actor_rollout_ref.model.path=$MODEL_PATH \
    actor_rollout_ref.hybrid_engine=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.strategy=fsdp2 \
    actor_rollout_ref.actor.loss_agg_mode=token-mean \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=30000 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.clip_ratio_high=0.28 \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.mode="async" \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.temperature=0.6 \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.val_kwargs.temperature=0.6 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.9 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    rllm.mask_truncated_samples=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='rllm-agent' \
    trainer.experiment_name='simple-math-simple-workflow' \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=200 \
    trainer.test_freq=10 \
    trainer.default_hdfs_dir=null \
    rllm.agent.max_steps=1 \
    rllm.stepwise_advantage.enable=False \
    rllm.workflow.use_workflow=True \
    trainer.total_epochs=100 \
    +rllm.proxy.host=127.0.0.1 \
    +rllm.proxy.port=4000 \
    +rllm.proxy.auto_start=False \
    +rllm.proxy.admin_token=my-shared-secret \
    +rllm.run_name=rllm-agent-omni-engine \
    +context_store.endpoint=http://localhost:8000 \
    +context_store.api_key=your-api-key-here
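A note on the override syntax: the keys prefixed with `+` are appended to the base `agent_ppo_trainer` config because it does not define them (standard Hydra behaviour), while unprefixed keys override values the config already has. The `+rllm.proxy.*` and `+context_store.*` values must match the proxy and context store launched in the README (host 127.0.0.1, port 4000, the shared admin token, and the context store endpoint and API key).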
Lines changed: 116 additions & 0 deletions
import asyncio
import re
import uuid

from rllm.agents.agent import Episode, Trajectory
from rllm.engine import RolloutEngine
from rllm.rewards.reward_fn import RewardFunction
from rllm.sdk import get_chat_client_async, session, set_reward_async
from rllm.workflows.workflow import Workflow


class Solver:
    def __init__(self, **kwargs):
        self.client = get_chat_client_async(base_url="http://localhost:4000/v1", api_key="EMPTY", model="vllm/Qwen/Qwen3-4B-Instruct-2507")

    async def generate_solution(self, problem: str) -> Trajectory:
        with session(agent="solver", groupby_key=str(uuid.uuid4())):
            messages = [{"role": "user", "content": f"{problem}. Output the final answer within <answer>...</answer>"}]
            response = await self.client.chat.completions.create(
                messages=messages,
                temperature=1,
                max_tokens=1000,
            )

            content = response.choices[0].message.content
            return response.id, self._parse_solver_response(content)

    async def generate_solutions(self, problem: str, n_solutions: int = 2) -> list[Trajectory]:
        tasks = [asyncio.create_task(self.generate_solution(problem)) for _ in range(n_solutions)]
        return await asyncio.gather(*tasks)

    def _parse_solver_response(self, response: str) -> str:
        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.IGNORECASE | re.DOTALL)
        if answer_match:
            return f"<answer>{answer_match.group(1).strip()}</answer>"
        else:
            return "No solution found"


class Judge:
    def __init__(self, **kwargs):
        self.client = get_chat_client_async(base_url="http://localhost:4000/v1", api_key="EMPTY", model="vllm/Qwen/Qwen3-4B-Instruct-2507")

    async def judge_solutions(self, problem: str, solutions: list[str]) -> Trajectory:
        with session(agent="judge"):
            messages = [{"role": "user", "content": self._create_judge_prompt(problem, solutions)}]
            response = await self.client.chat.completions.create(
                messages=messages,
                temperature=1,
                max_tokens=1000,
            )
            content = response.choices[0].message.content
            return response.id, self._parse_judge_response(content, solutions)

    def _parse_judge_response(self, response: str, solutions: list[str]) -> str:
        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.IGNORECASE | re.DOTALL)
        if answer_match:
            answer_text = answer_match.group(1).strip()
            try:
                solution_index = int(answer_text)
                return solutions[solution_index - 1]
            except (ValueError, IndexError):
                return ""
        return ""

    def _create_judge_prompt(self, problem: str, solutions: list[str]) -> str:
        """Create a prompt for the judge to evaluate solutions."""
        prompt = f"""You are an expert verifier. Given a countdown problem and multiple solution attempts, select a correct solution.
Problem:
{problem}
Solutions to evaluate:
"""
        for i, solution in enumerate(solutions, 1):
            prompt += f"\nSolution {i}:\n{solution}\n"

        prompt += """
A correct solution must satisfy the following criteria:
1. The solution uses only the given numbers.
2. Each number is used exactly once.
3. Only basic arithmetic operations (+, -, *, /) are used.
4. The calculation results in the target number.
5. The final answer is clearly marked within <answer>...</answer> tags.
Output the index of your selected solution within <answer>...</answer> tags, e.g., <answer>1</answer> for the first solution, <answer>2</answer> for the second solution, etc. If multiple solutions are correct, output the index of the first correct solution."""
        return prompt


class SolverJudgeWorkflow(Workflow):
    def __init__(self, rollout_engine: RolloutEngine, n_solutions: int = 2, reward_function: RewardFunction = None, **kwargs):
        super().__init__(rollout_engine, **kwargs)
        self.n_solutions = n_solutions
        self.reward_function = reward_function
        self.solver = Solver()
        self.judge = Judge()

    async def run(self, task: dict, uid: str, **kwargs) -> Episode:
        self.reset(task, uid)
        problem = task["question"]

        # Step 1: Solver generates multiple solutions in parallel
        solver_trajectories = await self.solver.generate_solutions(problem, self.n_solutions)

        # Assign rewards to solver trajectories
        solutions = []
        for response_id, solution in solver_trajectories:
            solutions.append(solution)
            reward = self.reward_function(task, solution).reward
            await set_reward_async(response_id, reward=reward)

        # Step 2: Judge selects the best solution
        response_id, selected_solution = await self.judge.judge_solutions(problem, solutions)

        # Evaluate the selected solution
        reward_result = self.reward_function(task, selected_solution)

        await set_reward_async(response_id, reward=reward_result.reward)
        return reward_result.reward
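For orientation, a hedged illustration of the kind of task this workflow handles (the exact dataset fields beyond `task["question"]` are an assumption): a countdown problem asks for an arithmetic expression over the given numbers that reaches a target, and the solver is expected to wrap its expression in `<answer>...</answer>`:

```python
# Illustrative (hypothetical) countdown task and solver output.
task = {"question": "Using the numbers 3, 7, and 25 exactly once, reach the target 46."}
solver_output = "3 * 7 = 21 and 21 + 25 = 46, so <answer>(3 * 7) + 25</answer>"

# _parse_solver_response keeps only "<answer>(3 * 7) + 25</answer>", and the
# reward function (countdown_reward_fn in the trainer below) scores that answer.
```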
Lines changed: 35 additions & 0 deletions
import hydra

from examples.omni_trainer.solver_judge_workflow.simple_solver_judge_flow import SolverJudgeWorkflow
from rllm.data.dataset import DatasetRegistry
from rllm.rewards.countdown_reward import countdown_reward_fn
from rllm.trainer.agent_trainer import AgentTrainer


async def run_workflow(**kwargs) -> float:
    task = kwargs
    workflow = SolverJudgeWorkflow(rollout_engine=None, executor=None, n_solutions=2, reward_function=countdown_reward_fn)
    return await workflow.run(task, "")


@hydra.main(config_path="pkg://rllm.trainer.config", config_name="agent_ppo_trainer", version_base=None)
def main(config):
    train_dataset = DatasetRegistry.load_dataset("countdown", "train")
    test_dataset = DatasetRegistry.load_dataset("countdown", "test")

    trainer = AgentTrainer(
        agent_run_func=run_workflow,
        workflow_class=SolverJudgeWorkflow,
        workflow_args={
            "n_solutions": 2,
            "reward_function": countdown_reward_fn,
        },
        config=config,
        train_dataset=train_dataset,
        val_dataset=test_dataset,
    )
    trainer.train()


if __name__ == "__main__":
    main()
