**tinker_cookbook/recipes/code_rl/README.md** (new file, +45 lines)
# Replicating DeepCoder with Tinker

Competitive programming problems are a common testbed for RL with LLMs. The recent [DeepCoder](https://pretty-radio-b75.notion.site/DeepCoder-A-Fully-Open-Source-14B-Coder-at-O3-mini-Level-1cf81902c14680b3bee5eb349a512a51) blog post [1] introduces a dataset and training pipeline for this purpose. This recipe demonstrates a similar setup using `Qwen3-4B-Instruct-2507`.

## Running This Demo

### Sandboxing

Sandboxing is essential for safely executing model-generated code during training and evaluation. We use [Sandbox Fusion](https://bytedance.github.io/SandboxFusion/) for this purpose. You can start a local sandbox in Docker with:

```bash
docker run -it -p 8080:8080 \
-v ${TINKER_COOKBOOK_ROOT}/tinker_cookbook/recipes/code_rl/sandbox_config/local.yaml:/root/sandbox/sandbox/configs/local.yaml \
volcengine/sandbox-fusion:server-20250609
```

Here, `${TINKER_COOKBOOK_ROOT}` is the absolute path to your local `tinker-cookbook` repository. The training script reads the sandbox endpoint from the `SANDBOX_URL` environment variable, which defaults to `http://localhost:8080/run_code`. Example:

```bash
export SANDBOX_URL=http://localhost:8080/run_code
```
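To verify the sandbox is reachable before training, you can send it a small request directly. A minimal sanity check, assuming the request shape documented by Sandbox Fusion (adjust if your server version differs):

```bash
curl -s http://localhost:8080/run_code \
  -H 'Content-Type: application/json' \
  -d '{"code": "print(1 + 1)", "language": "python"}'
```

A JSON response containing the program's stdout indicates the sandbox is working.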

If you prefer not to use Docker, you can set up the sandbox manually by following the instructions in the [Sandbox Fusion repository](https://github.com/bytedance/SandboxFusion?tab=readme-ov-file#installation).

### Example command

Train a `Qwen3-4B-Instruct-2507` model with:

```bash
python -m tinker_cookbook.recipes.code_rl.train \
model_name="Qwen/Qwen3-4B-Instruct-2507" \
group_size=8 groups_per_batch=128 \
learning_rate=4e-5 \
lora_rank=32 \
max_tokens=24576
```

After 100 steps of training, you can expect the following performance on **LiveCodeBench v6 (2025.02–2025.05)**:

| Model | Pass@1 | Pass@8 |
|-------|--------|--------|
| Qwen3-4B-Instruct-2507 (before training) | 33.8% | 44.3% |
| Qwen3-4B-Instruct-2507 (after 100 steps) | 42.7% | 55.0% |
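Pass@k numbers of this kind are conventionally computed with the unbiased estimator from Chen et al. (2021). A minimal sketch (illustrative, not taken from this recipe's code), where `n` is the number of samples drawn per problem and `c` the number that pass:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k)."""
    if n - c < k:
        return 1.0  # every size-k subset must contain a passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)
```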

[1] Luo, M., Tan, S., Huang, R., Patel, A., Ariyak, A., Wu, Q., Shi, X., Xin, R., Cai, C., Weber, M., Zhang, C., Li, L. E., Popa, R. A., & Stoica, I. (2025). DeepCoder: A fully open-source 14B coder at O3-mini level.
**tinker_cookbook/recipes/code_rl/code_env.py** (new file, +283 lines)
import json
from functools import partial
from typing import Any, Literal, Sequence, cast

import chz
from datasets import Dataset, concatenate_datasets, load_dataset

import tinker
from tinker_cookbook.recipes.code_rl.code_grading import (
extract_code_from_model,
sandbox_check_correctness,
taco_to_lcb_format,
)
from tinker_cookbook.recipes.code_rl.lcb_utils import fetch_live_code_bench_system_prompt
from tinker_cookbook import renderers
from tinker_cookbook.rl.problem_env import ProblemEnv, ProblemGroupBuilder, logger
from tinker_cookbook.rl.types import (
Action,
EnvGroupBuilder,
RLDataset,
RLDatasetBuilder,
StepResult,
)
from tinker_cookbook.tokenizer_utils import get_tokenizer
from tinker_cookbook.utils import logtree


def _load_deepcoder_split(split: Literal["train", "test"]) -> Dataset:
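    """Load and concatenate the DeepCoder subsets for the requested split."""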
if split == "train":
datasets = [
cast(
Dataset,
load_dataset("agentica-org/DeepCoder-Preview-Dataset", name=name, split="train"),
)
for name in ("primeintellect", "taco", "lcbv5")
]
elif split == "test":
datasets = [
cast(
Dataset,
load_dataset("agentica-org/DeepCoder-Preview-Dataset", name=name, split="test"),
)
for name in ("codeforces", "lcbv5")
]
    else:
        raise ValueError(f"Unknown split: {split!r}")
    return cast(Dataset, concatenate_datasets(datasets))


def _ensure_dict(metadata: Any) -> dict[str, Any]:
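    """Coerce metadata that may arrive as a JSON string into a plain dict."""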
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except json.JSONDecodeError:
logger.warning("Failed to deserialize metadata: %s", metadata)
return {}
if isinstance(metadata, dict):
return metadata
return {}


def _normalize_tests(raw_tests: Any, metadata: dict[str, Any]) -> list[dict[str, Any]]:
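    """Normalize raw test specs (JSON strings, TACO-style dicts, or single tests)
    into a list of LCB-style test dicts; returns [] if the specs are unusable."""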
tests = raw_tests
if isinstance(tests, str):
try:
tests = json.loads(tests)
except json.JSONDecodeError:
logger.warning("Failed to deserialize tests. Dropping sample.")
return []
if isinstance(tests, dict) and "inputs" in tests and "outputs" in tests:
tests = taco_to_lcb_format(tests)
if isinstance(tests, dict):
tests = [tests]

normalized: list[dict[str, Any]] = []
for test in tests or []:
if not isinstance(test, dict):
continue
testtype = test.get("testtype") or "stdin_stdout"
test_metadata = _ensure_dict(test.get("metadata", {}))
if testtype == "functional":
func_name = test_metadata.get("func_name") or metadata.get("func_name")
if func_name is not None:
test_metadata["func_name"] = str(func_name)
normalized.append(
{
"input": str(test.get("input", "")),
"output": str(test.get("output", "")),
"testtype": testtype,
"metadata": test_metadata or {"func_name": None},
}
)
return normalized


def _build_question(example: dict[str, Any]) -> str | None:
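    """Render a dataset row into an LCB-style prompt, or None if it lacks question text."""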
# Prefer preprocessed question if available.
question = example.get("question") or example.get("prompt") or example.get("problem")
if not isinstance(question, str) or not question.strip():
return None
starter_code = example.get("starter_code")
if isinstance(starter_code, str) and starter_code.strip():
return fetch_live_code_bench_system_prompt(question, starter_code)
return fetch_live_code_bench_system_prompt(question)


class CodeEnv(ProblemEnv):
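    """Single-turn environment that rewards model code for passing sandboxed test cases."""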
def __init__(
self,
problem: str,
tests: list[dict[str, Any]],
renderer: renderers.Renderer,
convo_prefix: list[renderers.Message] | None = None,
format_coef: float = 0.1,
reward_timeout: int = 6,
):
super().__init__(renderer, convo_prefix, format_coef=format_coef)
self.problem = problem
self.tests = tests
self.reward_timeout = reward_timeout

def get_question(self) -> str:
return self.problem

def check_format(self, sample_str: str) -> bool:
return extract_code_from_model(sample_str) is not None

def check_answer(self, sample_str: str) -> bool:
"""Not used - CodeEnv uses async check_sandbox_correctness instead."""
return False

async def check_sandbox_correctness(self, sample_str: str) -> bool:
"""Check if the code passes all test cases using sandbox execution."""
code = extract_code_from_model(sample_str)
if code is None:
logtree.log_text("No code block detected in response.")
return False

try:
success, details = await sandbox_check_correctness(
self.tests, code, timeout=self.reward_timeout
)
status = "✓" if success else "✗"
logtree.log_text(
f"Sandbox result {status}: {'All tests passed' if success else 'Failed'}"
)
return success
except Exception as exc:
logger.warning("Sandbox check failed: %s", exc, exc_info=True)
logtree.log_text(f"Sandbox check failed: {exc}")
return False

def get_reference_answer(self) -> str:
return ""

async def step(self, action: Action) -> StepResult:
message, parse_success = self.renderer.parse_response(action)
content = message["content"]
format_ok_bool = bool(parse_success) and self.check_format(content)
correct_answer_bool = await self.check_sandbox_correctness(content)
format_score = float(format_ok_bool)
correct_score = float(correct_answer_bool)
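        # Shaped reward: +1.0 when all sandbox tests pass, minus a flat penalty
        # of `format_coef` when the response has no parseable code block.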
total_reward = self.format_coef * (format_score - 1.0) + correct_score

logtree.log_text(f"Problem: {self.get_question()}")
logtree.log_text(f"Response: {content}")
if reference := self.get_reference_answer():
logtree.log_text(f"Reference Answer: {reference}")
logtree.log_text(
f"Format Valid: {'✓' if format_ok_bool else '✗'}, "
f"Correct: {'✓' if correct_answer_bool else '✗'}, "
f"Reward: {total_reward:.2f}"
)

return StepResult(
reward=total_reward,
episode_done=True,
next_observation=tinker.ModelInput.empty(),
next_stop_condition=self.stop_condition,
metrics={
"format": format_score,
"correct": correct_score,
},
)


class DeepcoderDataset(RLDataset):
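    """RL dataset over DeepCoder problems; each row yields a group of CodeEnv instances."""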
def __init__(
self,
batch_size: int,
group_size: int,
renderer: renderers.Renderer,
convo_prefix: list[renderers.Message] | None = None,
split: Literal["train", "test"] = "train",
seed: int = 0,
format_coef: float = 0.1,
reward_timeout: int = 6,
):
self.ds = _load_deepcoder_split(split)
if split == "train":
self.ds = self.ds.shuffle(seed=seed)
self.batch_size = batch_size
self.group_size = group_size if split == "train" else 1
self.renderer = renderer
self.convo_prefix = convo_prefix
self.format_coef = format_coef
self.reward_timeout = reward_timeout

def __len__(self) -> int:
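        # Ceiling division so a trailing partial batch is still counted.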
return (len(self.ds) + self.batch_size - 1) // self.batch_size

def get_batch(self, index: int) -> Sequence[EnvGroupBuilder]:
start = index * self.batch_size
end = min((index + 1) * self.batch_size, len(self.ds))
        if start >= end:
            raise IndexError(f"Batch index {index} is out of range for DeepcoderDataset")
builders: list[EnvGroupBuilder] = []
for row in self.ds.select(range(start, end)):
builder = self._make_env_group_builder(cast(dict[str, Any], row), self.group_size)
if builder is not None:
builders.append(builder)
return builders

def _make_env_group_builder(
self, example: dict[str, Any], group_size: int
) -> ProblemGroupBuilder | None:
metadata = _ensure_dict(example.get("metadata", {}))
tests = _normalize_tests(example.get("tests") or example.get("ground_truth"), metadata)
if not tests:
logger.warning("Skipping sample without valid tests.")
return None
question = _build_question(example)
if question is None:
logger.warning("Skipping sample without question text.")
return None
return ProblemGroupBuilder(
env_thunk=partial(
CodeEnv,
question,
tests,
self.renderer,
convo_prefix=self.convo_prefix,
format_coef=self.format_coef,
reward_timeout=self.reward_timeout,
),
num_envs=group_size,
dataset_name="deepcoder",
)


@chz.chz
class DeepcoderDatasetBuilder(RLDatasetBuilder):
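    """Chz config that builds the train and test DeepcoderDataset pair."""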
batch_size: int
model_name_for_tokenizer: str
renderer_name: str
group_size: int
convo_prefix: list[renderers.Message] | None = None
seed: int = 0
format_coef: float = 0.1
reward_timeout: int = 6

async def __call__(self) -> tuple[DeepcoderDataset, DeepcoderDataset]:
tokenizer = get_tokenizer(self.model_name_for_tokenizer)
renderer = renderers.get_renderer(self.renderer_name, tokenizer=tokenizer)
train_ds = DeepcoderDataset(
batch_size=self.batch_size,
group_size=self.group_size,
renderer=renderer,
convo_prefix=self.convo_prefix,
split="train",
seed=self.seed,
format_coef=self.format_coef,
reward_timeout=self.reward_timeout,
)
test_ds = DeepcoderDataset(
batch_size=self.batch_size,
group_size=1,
renderer=renderer,
convo_prefix=self.convo_prefix,
split="test",
seed=self.seed,
format_coef=self.format_coef,
reward_timeout=self.reward_timeout,
)
return train_ds, test_ds