Fix LLMObservationWrapper accumulating observations in self.full_observations across resets

kashif · kashif · commit 6876aff52c7f · 2025-11-07T21:50:36.000+01:00
diff --git a/src/envs/textarena_env/server/environment.py b/src/envs/textarena_env/server/environment.py
@@ -16,7 +16,12 @@
 
 from core.env_server.interfaces import Environment
 
-from ..models import TextArenaAction, TextArenaMessage, TextArenaObservation, TextArenaState
+from ..models import (
+    TextArenaAction,
+    TextArenaMessage,
+    TextArenaObservation,
+    TextArenaState,
+)
 from ..rewards import RewardProvider, build_reward_providers
 
 
@@ -92,6 +97,18 @@ def __init__(
     # Environment interface
     # ------------------------------------------------------------------
     def reset(self) -> TextArenaObservation:
+        # TextArena observation wrappers (LLMObservationWrapper, etc.) accumulate
+        # observations in self.full_observations across resets. Since we can't modify TextArena,
+        # we need to manually clear this state to prevent history accumulation.
+        env = self._ta_env
+        while hasattr(env, "env"):
+            if hasattr(env, "full_observations"):
+                env.full_observations = {}
+            env = env.env
+        # Also check the final unwrapped env
+        if hasattr(env, "full_observations"):
+            env.full_observations = {}
+
         self._ta_env.reset(num_players=self.num_players)
 
         for provider in self._reward_providers:
@@ -128,13 +145,18 @@ def step(self, action: TextArenaAction) -> TextArenaObservation:  # type: ignore
         observation.reward = reward
         self._state.last_reward = reward
 
-        reward_signals = self._compute_reward_signals(action=action, observation=observation)
+        reward_signals = self._compute_reward_signals(
+            action=action, observation=observation
+        )
         if reward_signals:
             observation.info.setdefault("reward_signals", {}).update(reward_signals)
             observation.metadata.setdefault("reward_signals", {}).update(reward_signals)
         self._last_reward_signals = reward_signals
         if reward_signals:
-            self._state.last_info = {**(self._state.last_info or {}), "reward_signals": reward_signals}
+            self._state.last_info = {
+                **(self._state.last_info or {}),
+                "reward_signals": reward_signals,
+            }
         self._state.raw_state = self._snapshot_state()
 
         return observation
@@ -150,16 +172,30 @@ def _build_observation(self) -> TextArenaObservation:
         player_id, messages = self._ta_env.get_observation()
 
         ta_messages = self._convert_messages(messages)
+
+        # Extract prompt from the appropriate messages.
+        # TextArena PROMPT type messages contain the game instructions added during reset.
+        # As a fallback for environments that don't use typed messages, use only the first
+        # message if we're at turn 0 (fresh reset).
         prompt_lines = [msg.content for msg in ta_messages if msg.category == "PROMPT"]
+
         if not prompt_lines:
-            # Fallback to most recent message history for prompt
-            prompt_lines = [msg.content for msg in ta_messages]
+            # Fallback: use the first message only if at turn 0 (just after reset)
+            # DO NOT use all messages as this causes history accumulation
+            current_turn = getattr(self._ta_env.state, "turn", 0)
+            if current_turn == 0 and ta_messages:
+                prompt_lines = [ta_messages[0].content]
+            else:
+                # Use env_id as final fallback to avoid including game history
+                prompt_lines = [self.env_id]
+
+        prompt = "\n".join(prompt_lines).strip()
 
         info: Dict[str, Any] = {}
         info.update(getattr(self._ta_env.state, "step_info", {}))
 
         observation = TextArenaObservation(
-            prompt="\n".join(prompt_lines).strip(),
+            prompt=prompt,
             messages=ta_messages,
             current_player_id=player_id,
             legal_players=self._legal_players(),
@@ -182,29 +218,31 @@ def _build_observation(self) -> TextArenaObservation:
 
     def _legal_players(self) -> List[int]:
         role_mapping = getattr(self._ta_env.state, "role_mapping", {}) or {}
-        players = [pid for pid in role_mapping.keys() if isinstance(pid, int) and pid >= 0]
+        players = [
+            pid for pid in role_mapping.keys() if isinstance(pid, int) and pid >= 0
+        ]
         return sorted(players)
 
     def _convert_messages(self, messages: Iterable[Any]) -> List[TextArenaMessage]:
         converted: List[TextArenaMessage] = []
-        buffered_content: List[str] = []
         buffered_sender: int | None = None
         buffered_category: str | None = None
-        last_char_was_newline = False
+        buffered_content: List[str] = []
 
         def flush_buffer() -> None:
             nonlocal buffered_content, buffered_sender, buffered_category
-            if buffered_content:
-                converted.append(
-                    TextArenaMessage(
-                        sender_id=buffered_sender if buffered_sender is not None else -1,
-                        content="".join(buffered_content),
-                        category=buffered_category or "MESSAGE",
-                    )
+            if not buffered_content:
+                return
+            converted.append(
+                TextArenaMessage(
+                    sender_id=buffered_sender if buffered_sender is not None else -1,
+                    content="".join(buffered_content),
+                    category=buffered_category or "MESSAGE",
                 )
+            )
             buffered_content = []
-            buffered_sender = None
             buffered_category = None
+            buffered_sender = None
 
         for entry in messages:
             if isinstance(entry, tuple) and len(entry) == 3:
@@ -219,29 +257,17 @@ def flush_buffer() -> None:
             sender_id = int(sender) if isinstance(sender, (int, float)) else -1
             text = str(content)
 
-            if text == "\n":
-                flush_buffer()
-                if last_char_was_newline:
-                    converted.append(
-                        TextArenaMessage(
-                            sender_id=sender_id,
-                            content="",
-                            category=category_name,
-                        )
-                    )
-                last_char_was_newline = True
-                continue
-
-            if buffered_sender is None or buffered_category is None:
-                buffered_sender = sender_id
-                buffered_category = category_name
-            elif buffered_sender != sender_id or buffered_category != category_name:
+            if (
+                buffered_content
+                and buffered_category == category_name
+                and buffered_sender == sender_id
+            ):
+                buffered_content.append(text)
+            else:
                 flush_buffer()
                 buffered_sender = sender_id
                 buffered_category = category_name
-
-            buffered_content.append(text)
-            last_char_was_newline = False
+                buffered_content = [text]
 
         flush_buffer()
 
diff --git a/tests/envs/test_textarena_environment.py b/tests/envs/test_textarena_environment.py
@@ -1,5 +1,5 @@
 from envs.textarena_env.server.environment import TextArenaEnvironment
-from envs.textarena_env.models import TextArenaMessage
+from envs.textarena_env.models import TextArenaMessage, TextArenaAction
 
 
 def test_convert_messages_coalesces_consecutive_characters():
@@ -23,42 +23,43 @@ def test_convert_messages_coalesces_consecutive_characters():
     ]
 
 
-def test_convert_messages_splits_on_newlines():
-    env = object.__new__(TextArenaEnvironment)
+def test_wordle_reset_clears_accumulated_state():
+    """Test that resetting Wordle environment clears accumulated observation state.
 
-    raw_messages = [
-        "[",
-        "G",
-        "A",
-        "M",
-        "E",
-        "]",
-        "\n",
-        "[",
-        "N",
-        "E",
-        "X",
-        "T",
-        "]",
-    ]
+    This test verifies the workaround for TextArena's LLMObservationWrapper,
+    which accumulates observations in self.full_observations across resets.
+    """
+    env = TextArenaEnvironment(
+        env_id="Wordle-v0",
+        num_players=1,
+    )
 
-    converted = env._convert_messages(raw_messages)
+    # First episode
+    obs1 = env.reset()
+    prompt1_len = len(obs1.prompt)
 
-    assert converted == [
-        TextArenaMessage(sender_id=-1, content="[GAME]", category="MESSAGE"),
-        TextArenaMessage(sender_id=-1, content="[NEXT]", category="MESSAGE"),
-    ]
+    # Make a move to accumulate some state
+    env.step(TextArenaAction(message="[CRANE]"))
 
+    # Second episode - should NOT accumulate from first episode
+    obs2 = env.reset()
+    prompt2_len = len(obs2.prompt)
 
-def test_convert_messages_preserves_blank_lines():
-    env = object.__new__(TextArenaEnvironment)
+    # Make another move
+    env.step(TextArenaAction(message="[STALE]"))
 
-    raw_messages = ["A", "\n", "\n", "B"]
+    # Third episode - should NOT accumulate from previous episodes
+    obs3 = env.reset()
+    prompt3_len = len(obs3.prompt)
 
-    converted = env._convert_messages(raw_messages)
+    # All prompts should be the same length (no accumulation)
+    assert prompt1_len == prompt2_len, (
+        f"Episode 2 accumulated state: {prompt1_len} -> {prompt2_len}"
+    )
+    assert prompt2_len == prompt3_len, (
+        f"Episode 3 accumulated state: {prompt2_len} -> {prompt3_len}"
+    )
 
-    assert converted == [
-        TextArenaMessage(sender_id=-1, content="A", category="MESSAGE"),
-        TextArenaMessage(sender_id=-1, content="", category="MESSAGE"),
-        TextArenaMessage(sender_id=-1, content="B", category="MESSAGE"),
-    ]
+    # Verify the prompts are actually the same content
+    assert obs1.prompt == obs2.prompt
+    assert obs2.prompt == obs3.prompt