for extendability: renamed module to armed_bandits, created Greedy superclass of EpsilonGreedy, and implemented option to weigh reward based on recency

bing-j · marcharper · commit 2a9dfaf3de4c · 2024-07-12T14:39:32.000-07:00
diff --git a/axelrod/data/all_classifiers.yml b/axelrod/data/all_classifiers.yml
@@ -1999,3 +1999,12 @@ ZD-SET-2:
   manipulates_state: false
   memory_depth: 1
   stochastic: true
+greedy:
+  inspects_source: false
+  long_run_time: false
+  makes_use_of: !!set
+    game: null
+  manipulates_source: false
+  manipulates_state: false
+  memory_depth: .inf
+  stochastic: false
diff --git a/axelrod/strategies/_strategies.py b/axelrod/strategies/_strategies.py
@@ -90,7 +90,7 @@
 from .dbs import DBS
 from .defector import Defector, TrickyDefector
 from .doubler import Doubler
-from .epsilon_greedy import EpsilonGreedy
+from .armed_bandits import Greedy, EpsilonGreedy
 from .finite_state_machines import (
     TF1,
     TF2,
@@ -378,6 +378,7 @@
     Golden,
     Gradual,
     GradualKiller,
+    Greedy,
     Grudger,
     GrudgerAlternator,
     Grumpy,
diff --git a/axelrod/strategies/armed_bandits.py b/axelrod/strategies/armed_bandits.py
@@ -0,0 +1,151 @@
+import math
+
+from axelrod.action import Action
+from axelrod.player import Player
+
+C, D = Action.C, Action.D
+
+
+class Greedy(Player):
+    """
+    A player that always chooses the optimal action based on the average reward of each action from previous turns.
+
+    If initial rewards for each action are equivalent (true by default),
+    then the optimal action for the first turn is cooperate.
+
+    Names:
+
+    - Greedy: [Sutton2018]_
+    """
+
+    name = "greedy"
+    classifier = {
+        "memory_depth": float("inf"),
+        "stochastic": False,
+        "makes_use_of": {"game"},  # update_rewards scores rounds with the match's game
+        "long_run_time": False,
+        "inspects_source": False,
+        "manipulates_source": False,
+        "manipulates_state": False,
+    }
+
+    UNIFORM = float("-inf")  # sentinel weight: plain sample average, no recency weighting
+
+    def __init__(
+        self,
+        init_c_reward: float = 0.0,
+        init_d_reward: float = 0.0,
+        recency_weight: float = UNIFORM
+    ) -> None:
+        """
+        Parameters
+        ----------
+        init_c_reward
+            Initial expected utility from action C; defaults to 0.0.
+        init_d_reward
+            Initial expected utility from action D; defaults to 0.0.
+        recency_weight
+            0.0 <= recency_weight <= 1.0 (out-of-range values are clamped; UNIFORM is the exception).
+            The exponential recency weight used in calculating the average reward.
+            If this argument is not provided, the player will not weigh rewards based on recency.
+        """
+        super().__init__()
+        self._rewards = {C: init_c_reward, D: init_d_reward}
+        self.weight = recency_weight
+
+        # treat out of range values as extremes (UNIFORM, i.e. -inf, is left untouched)
+        if (not math.isinf(self.weight)) and (self.weight <= 0):
+            self.weight = 0.0
+        if recency_weight >= 1:
+            self.weight = 1.0
+
+    def update_rewards(self, opponent: Player):
+        """Updates the expected reward associated with the last action."""
+        game = self.match_attributes["game"]
+        last_play = self.history[-1]
+        last_score = game.score((last_play, opponent.history[-1]))[0]
+
+        # if UNIFORM, use 1 / total number of times the updated action was taken previously
+        if math.isinf(self.weight):
+            times_played = self.history.cooperations if last_play == C else self.history.defections
+            weight = 1 / times_played  # last_play is in the history, so the count is >= 1
+        else:
+            weight = self.weight
+
+        # incremental average: move the estimate toward the newest score by `weight`
+        self._rewards[last_play] += weight * (last_score - self._rewards[last_play])
+
+    def strategy(self, opponent: Player) -> Action:
+        """Actual strategy definition that determines player's action."""
+        # if not the first turn
+        if len(self.history) != 0:
+            self.update_rewards(opponent)
+
+        # select the optimal play; max() returns the first maximal key, so ties favour C
+        return max(self._rewards, key=self._rewards.get)
+
+
+class EpsilonGreedy(Greedy):
+    """
+    Has a 1 - epsilon probability of behaving like Greedy(), and plays randomly otherwise.
+
+    Names:
+
+    - Epsilon-greedy: [Sutton2018]_
+    """
+
+    # NOTE(review): not a raw string, so "\v" is a vertical-tab escape; value kept byte-for-byte on purpose
+    name = "$\varepsilon$-greedy"
+    classifier = {
+        "memory_depth": float("inf"),
+        "stochastic": True,
+        "makes_use_of": {"game"},  # the inherited update_rewards scores rounds with the match's game
+        "long_run_time": False,
+        "inspects_source": False,
+        "manipulates_source": False,
+        "manipulates_state": False,
+    }
+
+    def __init__(
+        self,
+        epsilon: float = 0.1,
+        init_c_reward: float = 0.0,
+        init_d_reward: float = 0.0,
+        recency_weight: float = Greedy.UNIFORM,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        epsilon
+            0.0 <= epsilon <= 1.0
+            the probability that the player will "explore" (act uniformly random); defaults to 0.1
+        init_c_reward
+            initial expected utility from action C; defaults to 0.0.
+        init_d_reward
+            initial expected utility from action D; defaults to 0.0
+        recency_weight
+            exponential recency weight passed on to Greedy; defaults to Greedy.UNIFORM (no recency weighting)
+
+        Special cases
+        ----------
+            When epsilon <= 0, this player behaves like Greedy()
+            When epsilon >= 1, this player behaves like Random(0.5)
+        """
+        super().__init__(init_c_reward, init_d_reward, recency_weight)
+        # treat out of range values as extremes
+        self.epsilon = 0.0 if epsilon <= 0 else 1.0 if epsilon >= 1 else epsilon
+
+    def _post_init(self):
+        super()._post_init()
+        if self.epsilon == 0:  # never explores, so no random draw is ever made
+            self.classifier["stochastic"] = False
+
+    def strategy(self, opponent: Player) -> Action:
+        """Actual strategy definition that determines player's action."""
+        # always run Greedy first: it also records the reward of the previous round
+        greedy_action = super().strategy(opponent)
+        # explore
+        if self.epsilon > 0 and self._random.uniform(0.0, 1.0) <= self.epsilon:
+            return self._random.random_choice()
+        # exploit
+        return greedy_action
diff --git a/axelrod/strategies/epsilon_greedy.py b/axelrod/strategies/epsilon_greedy.py
diff --git a/axelrod/tests/strategies/test_armed_bandits.py b/axelrod/tests/strategies/test_armed_bandits.py
diff --git a/docs/reference/strategy_index.rst b/docs/reference/strategy_index.rst
@@ -18,6 +18,8 @@ Here are the docstrings of all the strategies in the library.
    :members:
 .. automodule:: axelrod.strategies.appeaser
    :members:
+.. automodule:: axelrod.strategies.armed_bandits
+   :members:
 .. automodule:: axelrod.strategies.averagecopier
    :members:
 .. automodule:: axelrod.strategies.axelrod_first
@@ -48,8 +50,6 @@ Here are the docstrings of all the strategies in the library.
    :members:
 .. automodule:: axelrod.strategies.forgiver
    :members:
-.. automodule:: axelrod.strategies.epsilon_greedy
-   :members:
 .. automodule:: axelrod.strategies.gambler
    :members:
 .. automodule:: axelrod.strategies.gobymajority