#!/usr/bin/env python3
"""Solve a coding task with a hosted LLM via Hugging Face Inference.

This script mirrors ``textarena_wordle_inference.py`` but targets the Coding
environment. It launches the CodingEnv Docker image locally and asks an
OpenAI-compatible model served through Hugging Face's router to iteratively
produce Python code until the task is solved.

Prerequisites
-------------
1. Build the Coding environment Docker image::

       docker build \
           -f src/envs/coding_env/server/Dockerfile \
           -t coding-env:latest .

2. Set your Hugging Face token, or any other API key that is compatible with
   the OpenAI API::

       export HF_TOKEN=your_token_here
       export API_KEY=your_api_key_here

3. Run the script::

       python examples/coding_env_inference.py

The script keeps sending execution feedback to the model until it prints
``Result: 338350`` or reaches the configured step limit.
"""

from __future__ import annotations

import os
import re
from typing import List, Tuple

from openai import OpenAI

from envs.coding_env import CodeAction, CodingEnv


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

API_BASE_URL = "https://router.huggingface.co/v1"
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")

MODEL = "openai/gpt-oss-120b:novita"
MAX_STEPS = 5
VERBOSE = True

CODING_TASK = (
    "Write Python code that prints the sum of squares of the integers from 1 "
    "to 100 inclusive. The final line must be exactly `Result: <value>` with "
    "the correct number substituted."
)
EXPECTED_SUBSTRING = "Result: 338350"
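# Sanity check on the expected value: the closed form n(n + 1)(2n + 1) / 6
# gives 100 * 101 * 201 / 6 = 338350 for n = 100.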

SYSTEM_PROMPT = (
    "You are an expert Python programmer. Respond with valid Python code that "
    "solves the user's task. Always wrap your final answer in a fenced code "
    "block starting with ```python. Provide a complete script that can be "
    "executed as-is, with no commentary outside the code block."
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def extract_python_code(text: str) -> str:
    """Extract the first Python code block from the model output."""

    code_blocks = re.findall(
        r"```(?:python)?\s*(.*?)```",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if code_blocks:
        return code_blocks[0].strip()
    return text.strip()

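# Illustrative behavior (hypothetical reply, not part of the script's flow):
#   extract_python_code("Sure!\n```python\nprint('hi')\n```") -> "print('hi')"
# Replies with no fenced block are returned verbatim, so the sandbox run
# surfaces whatever the model actually wrote.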

def format_feedback(
    step: int,
    stdout: str,
    stderr: str,
    exit_code: int,
) -> str:
    """Generate feedback text describing the previous execution."""

    stdout_display = stdout if stdout.strip() else "<empty>"
    stderr_display = stderr if stderr.strip() else "<empty>"
    return (
        f"Execution feedback for step {step}:\n"
        f"exit_code={exit_code}\n"
        f"stdout:\n{stdout_display}\n"
        f"stderr:\n{stderr_display}\n"
        "If the task is not solved, return an improved Python script."
    )

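# A failed step produces feedback shaped roughly like:
#   Execution feedback for step 1:
#   exit_code=1
#   stdout:
#   <empty>
#   stderr:
#   Traceback (most recent call last): ...
#   If the task is not solved, return an improved Python script.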

def build_initial_prompt(task: str) -> str:
    """Construct the first user prompt for the coding task."""

    return (
        "You must write Python code to satisfy the following task. "
        "When executed, your script should behave exactly as described.\n\n"
        f"Task:\n{task}\n\n"
        "Reply with the full script in a single ```python code block."
    )


# ---------------------------------------------------------------------------
# Gameplay
# ---------------------------------------------------------------------------

def solve_coding_task(
    env: CodingEnv,
    client: OpenAI,
) -> Tuple[bool, List[str]]:
    """Iteratively ask the model for code until the task is solved."""

    history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_initial_prompt(CODING_TASK)},
    ]

    env.reset()  # start a fresh sandbox session; the initial observation is unused

    transcripts: List[str] = []

    for step in range(1, MAX_STEPS + 1):
        response = client.chat.completions.create(
            model=MODEL,
            messages=history,
            max_tokens=2048,
            temperature=0.2,
        )

        # message.content is Optional in the OpenAI SDK; guard against None.
        assistant_message = (response.choices[0].message.content or "").strip()
        history.append({"role": "assistant", "content": assistant_message})

        code = extract_python_code(assistant_message)

        if VERBOSE:
            print(f"\n🛠️ Step {step}: executing model-produced code")
            print(code)

        result = env.step(CodeAction(code=code))
        obs = result.observation

        transcripts.append(
            f"Step {step} | exit_code={obs.exit_code}\n"
            f"stdout:\n{obs.stdout}\n"
            f"stderr:\n{obs.stderr}\n"
        )

        if VERBOSE:
            print(" ▶ exit_code:", obs.exit_code)
            if obs.stdout:
                print(" ▶ stdout:\n" + obs.stdout)
            if obs.stderr:
                print(" ▶ stderr:\n" + obs.stderr)

        solved = obs.exit_code == 0 and EXPECTED_SUBSTRING in obs.stdout
        if solved:
            return True, transcripts

        history.append(
            {
                "role": "user",
                "content": format_feedback(
                    step,
                    obs.stdout,
                    obs.stderr,
                    obs.exit_code,
                ),
            }
        )

        # Keep conversation history compact to avoid exceeding context limits.
        if len(history) > 20:
            history = [history[0]] + history[-19:]
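        # The slice above always pins history[0], the system prompt, so the
        # model's instructions survive truncation.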

    return False, transcripts


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------

def main() -> None:
    if not API_KEY:
        raise SystemExit(
            "HF_TOKEN (or API_KEY) must be set to query the model."
        )

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

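    # The mapping below assumes the server inside the image listens on port
    # 8000; adjust ports= if your build exposes a different port.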
    env = CodingEnv.from_docker_image(
        "coding-env:latest",
        ports={8000: 8000},
    )

    try:
        success, transcripts = solve_coding_task(env, client)
    finally:
        env.close()

    print(
        "\n✅ Session complete"
        if success
        else "\n⚠️ Session finished without solving the task"
    )
    print("--- Execution transcripts ---")
    for entry in transcripts:
        print(entry)


if __name__ == "__main__":
    main()