|
| 1 | +""" |
| 2 | +envs/coding_env/env.py |
| 3 | +-------------------------------- |
| 4 | +Concrete environment implementation using the core BaseEnv. |
| 5 | +POC implementation: code execution is currently stubbed; a local-subprocess or container-based executor can be plugged in later. |
| 6 | +""" |
| 7 | + |
| 8 | +from __future__ import annotations |
| 9 | + |
| 10 | +import subprocess |
| 11 | +from typing import Optional |
| 12 | + |
| 13 | +from core.base import BaseEnv |
| 14 | +from core.types import StepResult |
| 15 | + |
| 16 | +from .models import CodeAction, CodeObservation |
| 17 | + |
| 18 | + |
class CodingEnv(BaseEnv[CodeAction, CodeObservation]):
    """
    Minimal Coding Environment.

    POC behavior:
      - reset(): returns a fresh, empty observation (no persistent state).
      - step(action): validates the action type and returns a placeholder
        observation (empty stdout/stderr, exit_code 0). No code is executed
        yet; see the TODO in step().

    Future swap:
      Replace the placeholder observation in step() with a call to your
      Docker/gateway backend without changing the public API.
    """

    def __init__(
        self,
        default_timeout_s: float = 10.0,
        python_executable: str = "python",
    ):
        """
        Args:
            default_timeout_s: Max seconds to allow code execution before timing out.
            python_executable: Interpreter to run (e.g., "python3", a venv path, etc.).

        Both values are only stored here; the placeholder step() does not
        consume them yet.
        """
        self._default_timeout_s = float(default_timeout_s)
        self._python = python_executable

    # --- BaseEnv interface ---

    def reset(self) -> CodeObservation:
        """Return the initial observation: empty stdout/stderr and exit_code 0."""
        # No state to clear in this POC.
        return CodeObservation(stdout="", stderr="", exit_code=0)

    def step(self, action: CodeAction) -> StepResult[CodeObservation]:
        """
        Process one action and return the resulting step.

        Args:
            action: The action to process. It is only type-checked for now.

        Returns:
            A StepResult carrying the observation, a reward of 1.0 when the
            observation has exit_code 0 and empty stderr (0.0 otherwise), and
            done=False.

        Raises:
            TypeError: If `action` is not a CodeAction.
        """
        if not isinstance(action, CodeAction):
            raise TypeError(f"Expected CodeAction, got {type(action)!r}")

        # TODO: replace dummy response with the call to the code executor inside the container
        obs = CodeObservation(stderr="", stdout="", exit_code=0)

        # Simple reward heuristic: success and no stderr -> 1.0 else 0.0
        reward: float = 1.0 if (obs.exit_code == 0 and not obs.stderr) else 0.0

        # The previous version assembled an `info` dict (timed_out, interpreter)
        # that was never returned; it is dropped here so the code does not
        # suggest diagnostics that callers cannot actually see.
        return StepResult(
            observation=obs,
            reward=reward,
            done=False,  # Coding env is not episodic by default
        )
0 commit comments