"""BrowserGym MiniWoB example with Qwen deciding the next action.

This is an inference example for the BrowserGym environment. It uses the OpenAI
client and a vision-language model to decide the next action. We use the Hugging
Face Inference Providers API to access the model, but you can use any other
provider that is compatible with the OpenAI API.

Prerequisites:
- Clone the MiniWoB++ tasks repository.
- Serve the HTML bundle with `python -m http.server 8888` inside the
  `miniwob-plusplus/miniwob/html` directory.
- Export the MiniWoB URL (must include the `/miniwob/` suffix):
  `export MINIWOB_URL=http://host.docker.internal:8888/miniwob/`
- Export your Hugging Face token for the router:
  `export HF_TOKEN=your_token_here`

Usage:
    python examples/browsergym_example.py
"""
| 20 | + |
| 21 | +import os |
| 22 | +import re |
| 23 | +import base64 |
| 24 | +import textwrap |
| 25 | +from io import BytesIO |
| 26 | +from typing import List, Optional, Dict |
| 27 | + |
| 28 | +from openai import OpenAI |
| 29 | +import numpy as np |
| 30 | +from PIL import Image |
| 31 | + |
| 32 | +from envs.browsergym_env import BrowserGymAction, BrowserGymEnv |
| 33 | + |
# --- Model endpoint ---------------------------------------------------------
# OpenAI-compatible base URL handed to the OpenAI client in main().
API_BASE_URL = "https://router.huggingface.co/v1"
# Credential for the endpoint; HF_TOKEN wins over API_KEY when both are set.
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
MODEL_NAME = "Qwen/Qwen3-VL-30B-A3B-Instruct:novita"

# --- Episode and sampling settings ------------------------------------------
# Upper bound on the number of model/environment round trips per episode.
MAX_STEPS = 8
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_DOM_CHARS = 3500
# Sampling parameters forwarded to the chat-completion request.
TEMPERATURE = 0.2
MAX_TOKENS = 200
# Action used whenever the model reply is missing or cannot be parsed.
FALLBACK_ACTION = "noop()"

# NOTE(review): not referenced anywhere in the visible part of this file.
DEBUG = True

# --- Reply parsing ----------------------------------------------------------
# Strips a leading "Action:" / "Next action -" label from a reply line.
ACTION_PREFIX_RE = re.compile(r"^(action|next action)\s*[:\-]\s*", re.IGNORECASE)
# Matches something shaped like a call: an identifier followed by "(...)".
# The match is greedy and, with DOTALL, may span several lines.
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)
| 49 | + |
| 50 | + |
# System message sent with every model request. It pins the reply format to a
# single BrowserGym action string so that parse_model_action() can extract it.
SYSTEM_PROMPT = "\n".join(
    (
        "You control a web browser through BrowserGym.",
        "Reply with exactly one action string.",
        "The action must be a valid BrowserGym command such as:",
        "- noop()",
        "- click('<BID>')",
        "- type('selector', 'text to enter')",
        "- fill('selector', 'text to enter')",
        "- send_keys('Enter')",
        "- scroll('down')",
        "Use single quotes around string arguments.",
        "When clicking, use the BrowserGym element IDs (BIDs) listed in the user message.",
        "If you are unsure, respond with noop().",
        "Do not include explanations or additional text.",
    )
)
| 68 | + |
| 69 | + |
def build_history_lines(history: List[str], max_lines: int = 4) -> str:
    """Render the most recent history entries as a newline-separated block.

    Args:
        history: One summary line per step already taken, oldest first.
        max_lines: How many of the latest entries to keep. Defaults to 4,
            which matches the previously hard-coded window.

    Returns:
        The last ``max_lines`` entries joined with newlines, or the literal
        string ``"None"`` when there is nothing to show.
    """
    # A non-positive window must not reach the slice below: history[-0:]
    # would return the *entire* list instead of nothing.
    if not history or max_lines <= 0:
        return "None"
    return "\n".join(history[-max_lines:])
| 74 | + |
| 75 | + |
def extract_screenshot_uri(observation) -> Optional[str]:
    """Encode the observation's screenshot as a base64 PNG ``data:`` URI.

    Args:
        observation: Object exposing a ``screenshot`` attribute that NumPy can
            convert to a ``uint8`` array (or ``None`` when absent).

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when the
        observation carries no screenshot.
    """
    raw_pixels = observation.screenshot
    if raw_pixels is None:
        return None

    picture = Image.fromarray(np.array(raw_pixels, dtype=np.uint8))
    png_buffer = BytesIO()
    picture.save(png_buffer, format="PNG")
    # getvalue() returns the whole buffer regardless of the stream position.
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded_png}"
| 86 | + |
| 87 | + |
def extract_clickable_elements(observation) -> List[Dict[str, str]]:
    """Collect the BrowserGym element IDs (BIDs) flagged as clickable.

    The element properties are read from
    ``observation.metadata["browsergym_obs"]["extra_element_properties"]``;
    any missing or ``None`` level along that path yields an empty result.

    Args:
        observation: Object that may expose a ``metadata`` mapping.

    Returns:
        One ``{"bid": ..., "bbox": ...}`` dict per clickable element, sorted
        by the BID string. ``bbox`` is the comma-separated bounding box, or
        ``"?"`` when the element has none.
    """
    metadata = getattr(observation, "metadata", {}) or {}
    obs_dict = metadata.get("browsergym_obs", {}) or {}
    extra_props = obs_dict.get("extra_element_properties", {}) or {}

    clickables: List[Dict[str, str]] = []
    for bid, props in extra_props.items():
        # Tolerate a None/empty property record instead of crashing on .get().
        if not props or not props.get("clickable"):
            continue

        bbox = props.get("bbox") or []
        # bbox entries may be numbers rather than strings, and str.join()
        # raises TypeError on non-string items, so convert each one first.
        bbox_str = ", ".join(str(value) for value in bbox) if bbox else "?"
        clickables.append({"bid": str(bid), "bbox": bbox_str})

    # Keep a stable ordering for readability
    clickables.sort(key=lambda item: item["bid"])
    return clickables
| 110 | + |
| 111 | + |
def build_user_prompt(step: int, observation, history: List[str]) -> str:
    """Compose the text part of the user message for one decision step.

    Args:
        step: 1-based index of the step about to be taken.
        observation: Current observation; ``goal``, ``url`` and
            ``last_action_error`` are read from it, and it is passed to
            ``extract_clickable_elements``.
        history: Summary lines of the steps taken so far.

    Returns:
        A multi-line prompt with no leading indentation on any template line.
    """
    goal = observation.goal or "(not provided)"
    url = observation.url or "(unknown)"
    error_note = "Yes" if observation.last_action_error else "No"

    clickables = extract_clickable_elements(observation)
    if clickables:
        # Put the label on its own line so every element is listed uniformly.
        clickable_lines = ["Available clickable element IDs:"]
        clickable_lines.extend(
            f"  - {item['bid']} (bbox: {item['bbox']})" for item in clickables
        )
    else:
        clickable_lines = ["Available clickable element IDs: (none detected)"]

    # The prompt is assembled line by line rather than with textwrap.dedent():
    # dedent() runs after interpolation, so multi-line values (history, element
    # list) that start at column 0 would erase the common margin and leave the
    # template's own indentation in the prompt.
    prompt_lines = [
        f"Step: {step}",
        f"Goal: {goal}",
        f"Current URL: {url}",
        "Previous steps:",
        build_history_lines(history),
        f"Last action error: {error_note}",
        "",
        *clickable_lines,
        "",
        "Reply with exactly one BrowserGym action string.",
    ]
    return "\n".join(prompt_lines)
| 140 | + |
| 141 | + |
def _trim_to_balanced_call(candidate: str) -> str:
    """Cut *candidate* right after the parenthesis that closes its first call.

    Parentheses inside single- or double-quoted arguments are ignored, and a
    backslash escapes the following character inside a quoted argument. When
    the parentheses never balance, *candidate* is returned unchanged.
    """
    depth = 0
    quote = ""
    escaped = False
    for index, char in enumerate(candidate):
        if quote:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == quote:
                quote = ""
            continue
        if char in "'\"":
            quote = char
        elif char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                return candidate[: index + 1]
    return candidate


def parse_model_action(response_text: str) -> str:
    """Extract a single BrowserGym action string from the model's reply.

    Each line is tried first, after removing an ``Action:``-style prefix; the
    whole reply is searched last so a call split across lines is still found.
    The matched call is trimmed at its closing parenthesis, so trailing prose
    such as ``click('12') (the submit button)`` does not end up in the action,
    and runs of whitespace are collapsed to single spaces.

    Args:
        response_text: Raw text returned by the model; may be empty.

    Returns:
        The extracted action, or ``FALLBACK_ACTION`` when nothing in the reply
        looks like a call.
    """
    if not response_text:
        return FALLBACK_ACTION

    # Prefer the first line that looks like an action string, then fall back
    # to searching the whole response.
    candidates = [
        ACTION_PREFIX_RE.sub("", raw_line.strip())
        for raw_line in response_text.splitlines()
    ]
    candidates.append(response_text)

    for candidate in candidates:
        match = ACTION_PATTERN.search(candidate)
        if match:
            action = _trim_to_balanced_call(match.group(0)).strip()
            # Collapse internal whitespace
            return re.sub(r"\s+", " ", action)

    return FALLBACK_ACTION
| 170 | + |
| 171 | + |
def _request_model_reply(client, messages) -> str:
    """Ask the chat model for its next action and return the raw reply text.

    Any failure (transport error, malformed response) is reported on stdout
    and replaced by ``FALLBACK_ACTION`` so that one bad request does not abort
    the episode.
    """
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        return completion.choices[0].message.content or ""
    # pylint: disable=broad-except
    except Exception as exc:  # noqa: BLE001
        print(f"Model request failed ({exc}). Using fallback action.")
        return FALLBACK_ACTION


def main() -> None:
    """Run one MiniWoB ``click-test`` episode with the model choosing actions.

    For at most ``MAX_STEPS`` steps the current observation (text summary plus
    screenshot, when available) is sent to the model, its reply is parsed into
    an action and the action is executed in the environment. The environment
    is always closed on exit.

    Raises:
        SystemExit: If neither ``HF_TOKEN`` nor ``API_KEY`` is set.
    """
    # Fail early with an actionable message. With api_key=None the OpenAI
    # client falls back to OPENAI_API_KEY, which would either raise an
    # unrelated error or send an OpenAI credential to API_BASE_URL.
    if not API_KEY:
        raise SystemExit(
            "Missing credentials: export HF_TOKEN (or API_KEY) before running "
            "this example."
        )

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    env_vars = {
        "BROWSERGYM_BENCHMARK": "miniwob",
        "BROWSERGYM_TASK_NAME": "click-test",
    }
    # The module docstring asks the user to export MINIWOB_URL; forward it so
    # the environment container actually receives it.
    # NOTE(review): assumes from_hub() does not already forward it - confirm.
    miniwob_url = os.getenv("MINIWOB_URL")
    if miniwob_url:
        env_vars["MINIWOB_URL"] = miniwob_url

    env = BrowserGymEnv.from_hub(
        "browsergym-env:latest",
        env_vars=env_vars,
        ports={8000: 8000},
    )

    history: List[str] = []

    try:
        result = env.reset()
        observation = result.observation
        print(f"Episode goal: {observation.goal}")

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                print("Environment signalled done. Stopping early.")
                break

            user_prompt = build_user_prompt(step, observation, history)
            user_content = [{"type": "text", "text": user_prompt}]
            screenshot_uri = extract_screenshot_uri(observation)
            if screenshot_uri:
                user_content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": screenshot_uri},
                    }
                )

            messages = [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": SYSTEM_PROMPT}],
                },
                {
                    "role": "user",
                    "content": user_content,
                },
            ]

            response_text = _request_model_reply(client, messages)
            action_str = parse_model_action(response_text)
            print(f"Step {step}: model suggested -> {action_str}")

            result = env.step(BrowserGymAction(action_str=action_str))
            observation = result.observation

            reward = result.reward or 0.0
            error_flag = " ERROR" if observation.last_action_error else ""
            history_line = (
                f"Step {step}: {action_str} -> reward {reward:+.2f}"
                f"{error_flag}"
            )
            history.append(history_line)
            print(
                "  Reward: "
                f"{reward:+.2f} | Done: {result.done} | Last action error: "
                f"{observation.last_action_error}"
            )

            if result.done:
                print("Episode complete.")
                break

        else:
            # for/else: runs only when the loop finished without a break.
            print(f"Reached max steps ({MAX_STEPS}).")

    finally:
        env.close()
| 264 | + |
| 265 | + |
# Run the episode only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments