@@ -10,7 +10,7 @@ python3 -m examples.solver_judge.train_solver_judge_flow \
1010 data.train_batch_size=64 \
1111 data.max_prompt_length=2048 \
1212 data.max_response_length=1024 \
13- actor_rollout_ref.model.path=Qwen/Qwen3-0.6B \
13+ actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 \
1414 actor_rollout_ref.actor.optim.lr=1e-6 \
1515 actor_rollout_ref.model.use_remove_padding=True \
1616 actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \
@@ -31,12 +31,13 @@ python3 -m examples.solver_judge.train_solver_judge_flow \
3131 actor_rollout_ref.rollout.name=vllm \
3232 actor_rollout_ref.rollout.mode="async" \
3333 actor_rollout_ref.rollout.enforce_eager=False \
34- actor_rollout_ref.rollout.temperature=0.6 \
34+ actor_rollout_ref.rollout.temperature=1.0 \
35+ actor_rollout_ref.rollout.top_p=1.0 \
3536 actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
3637 actor_rollout_ref.rollout.n=4 \
3738 actor_rollout_ref.rollout.val_kwargs.n=1 \
38- actor_rollout_ref.rollout.val_kwargs.temperature=0.6 \
39- actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
39+ actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
40+ actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
4041 actor_rollout_ref.ref.fsdp_config.param_offload=True \
4142 algorithm.adv_estimator=grpo \
4243 rllm.compact_filtering.enable=False \
@@ -59,6 +60,7 @@ python3 -m examples.solver_judge.train_solver_judge_flow \
5960 trainer.test_freq=10 \
6061 trainer.default_hdfs_dir=null \
6162 trainer.total_epochs=100 \
62- rllm.workflow.use_workflow=True
63+ rllm.workflow.use_workflow=True \
64+ +ray_init._temp_dir=/home/tianhao/tmp
6365
6466pkill -9 -f 'ray::WorkerDict'
0 commit comments