From cd234ab4fdc722dba19300b867b5fd15cfd5ad4f Mon Sep 17 00:00:00 2001
From: ymkymkymkymx <1435664939@qq.com>
Date: Mon, 15 Jul 2019 10:51:14 -0400
Subject: [PATCH 01/56] Messing with mountain car

---
 agents_using_gym/gymMountainCarv0/README.md   | 12 ++++
 agents_using_gym/gymMountainCarv0/cheating.py | 13 ++++
 .../gymMountainCarv0/simpleqlearning.py       | 62 +++++++++++++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 agents_using_gym/gymMountainCarv0/README.md
 create mode 100644 agents_using_gym/gymMountainCarv0/cheating.py
 create mode 100644 agents_using_gym/gymMountainCarv0/simpleqlearning.py

diff --git a/agents_using_gym/gymMountainCarv0/README.md b/agents_using_gym/gymMountainCarv0/README.md
new file mode 100644
index 000000000..64aca326a
--- /dev/null
+++ b/agents_using_gym/gymMountainCarv0/README.md
@@ -0,0 +1,12 @@
+## This folder includes some agents for gym's mountain car environment.
+## The code in this folder uses Python 3.6.1, gym==0.13.1, numpy==1.16.4. It uses some functions from gym==0.13.1 which are not implemented in gym==0.10.5, so please upgrade your gym before running these scripts.
+### If you don't know how to upgrade gym:
+```
+pip uninstall gym
+pip install gym
+```
+## Python files
+### These files are just using gym, and can be run by ```python filename.py``` or ```python3 filename.py``` if you are using linux. IDEs shold be able to run them as well.
+### cheating.py is an straight solution by Mark Yu after 2 second of thinking about this game, it represents Mark's superiority against AI. JK.
+### simpleqlearning.py is an implementation of qlearning that Mark learnt from wikipedia [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning). Feel free to mess with the learning rate and discountrate and compare the time it takes for the car to reach the summit.
+
diff --git a/agents_using_gym/gymMountainCarv0/cheating.py b/agents_using_gym/gymMountainCarv0/cheating.py
new file mode 100644
index 000000000..c67572d6c
--- /dev/null
+++ b/agents_using_gym/gymMountainCarv0/cheating.py
@@ -0,0 +1,13 @@
+import gym
+env = gym.make("MountainCar-v0")
+
+done=False
+state=env.reset()
+while not done:
+    if state[1]<=0:
+        state, reward, done,info = env.step(0)
+    else:
+        state, reward, done,info = env.step(2)
+    env.render()
+
+env.close()
\ No newline at end of file
diff --git a/agents_using_gym/gymMountainCarv0/simpleqlearning.py b/agents_using_gym/gymMountainCarv0/simpleqlearning.py
new file mode 100644
index 000000000..f955d6e5c
--- /dev/null
+++ b/agents_using_gym/gymMountainCarv0/simpleqlearning.py
@@ -0,0 +1,62 @@
+import gym
+import numpy
+
+env = gym.make("MountainCar-v0")
+
+learningrate = 0.7
+discount = 0.90
+#initialize the Q table [40,40,3] with random values. Each entry of the Q table is the Q value of a [position state, velocity state, action you take] triple.
+#Note that the game is continuous but the states of our Q table are discrete (since we can only deal with finitely many states), so I also need a getstate function to turn the continuous states into discrete states.
+#All Q values are initialized between -2 and 0 because the reward is always -1 in the mountain car game.
+q_table = numpy.random.uniform(-2, 0, [40,40,3]) + + +def getstate(state): + discrete_state = (state - env.observation_space.low)/((env.observation_space.high-env.observation_space.low)/[40,40]) + return tuple(discrete_state.astype(numpy.int)) # we use this tuple to look up the 3 Q values for the available actions in the q-table + + +for episode in range(2700): + currentstate = getstate(env.reset()) + done = False + #render every 300 episodes to save time. + if episode % 300 == 0: + render = True + print(episode) + else: + render = False + + while not done: + action = numpy.argmax(q_table[currentstate]) + new_state, reward, done,info = env.step(action) + #nextstate is the discrete mapping from the new state to the q table + nextstate = getstate(new_state) + + if render: + env.render() + + # Update Q table + if not done: + # Maximum possible Q value in next step (for new state) + maxnextq = numpy.max(q_table[nextstate]) + # Current Q value (for current state and performed action) + current_q = q_table[currentstate + (action,)] + # the qlearning function + new_q = (1 - learningrate) * current_q + learningrate * (reward + discount * maxnextq) + # Update Q table with new Q value + q_table[currentstate + (action,)] = new_q + + + # Simulation ended (for any reson) - if goal position is achived - update Q value with reward directly + elif new_state[0] >= env.goal_position: + print("We make it!") + print(episode) + q_table[currentstate + (action,)] = 0 + + + currentstate = nextstate + + + + +env.close() \ No newline at end of file From 3703822df03cff68dcd5c6ec48ba1e17eb955d29 Mon Sep 17 00:00:00 2001 From: ymkymkymkymx <43044797+ymkymkymkymx@users.noreply.github.com> Date: Mon, 15 Jul 2019 10:53:33 -0400 Subject: [PATCH 02/56] Update README.md --- agents_using_gym/gymMountainCarv0/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents_using_gym/gymMountainCarv0/README.md b/agents_using_gym/gymMountainCarv0/README.md index 64aca326a..a36f90fd1 100644 --- a/agents_using_gym/gymMountainCarv0/README.md +++ b/agents_using_gym/gymMountainCarv0/README.md @@ -6,7 +6,7 @@ pip uninstall gym pip install gym ``` ## Python files -### These files are just using gym, and can be run by ```python filename.py``` or ```python3 filename.py``` if you are using linux. IDEs shold be able to run them as well. +### These files are just using gym, and can be run by ```python filename.py``` (or ```python3 filename.py``` if you are using linux.) IDEs shold be able to run them as well. ### cheating.py is an straight solution by Mark Yu after 2 second of thinking about this game, it represents Mark's superiority against AI. JK. ### simpleqlearning.py is an implementation of qlearning that Mark learnt from wikipedia [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning). Feel free to mess with the learning rate and discountrate and compare the time it takes for the car to reach the summit. 
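The README above suggests varying the learning rate and the discount and comparing how long the car takes to reach the summit. Below is a minimal sketch of that comparison, reusing the discretization and update rule from simpleqlearning.py. It assumes gym==0.13.1 and numpy as stated in the README; the bin count, episode budget, and the `first_success_episode` helper name are illustrative choices, not taken from the patches.

```python
import gym
import numpy as np

def first_success_episode(learning_rate, discount, episodes=2000, bins=40):
    """Train tabular Q-learning on MountainCar-v0 and return the first episode
    that reaches the flag (position >= 0.5), or None if it never does."""
    env = gym.make("MountainCar-v0")
    q_table = np.random.uniform(-2, 0, [bins, bins, 3])
    span = env.observation_space.high - env.observation_space.low

    def get_state(obs):
        # map the continuous (position, velocity) pair to discrete bin indices
        return tuple(((obs - env.observation_space.low) / (span / bins)).astype(int))

    for episode in range(episodes):
        state = get_state(env.reset())
        done = False
        while not done:
            action = np.argmax(q_table[state])        # greedy action from the table
            obs, reward, done, _ = env.step(action)
            new_state = get_state(obs)
            if not done:
                # same update as simpleqlearning.py, written in incremental form
                target = reward + discount * np.max(q_table[new_state])
                q_table[state + (action,)] += learning_rate * (target - q_table[state + (action,)])
            elif obs[0] >= 0.5:                       # reached the summit
                q_table[state + (action,)] = 0
                env.close()
                return episode
            state = new_state
    env.close()
    return None

for lr in (0.3, 0.5, 0.7):
    print("learning rate", lr, "-> first success at episode", first_success_episode(lr, 0.90))
```

The same loop can be reused to sweep the discount instead of the learning rate.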
From 6a5113c50582db60b02d685352a79138a588f0ba Mon Sep 17 00:00:00 2001 From: ymkymkymkymx <43044797+ymkymkymkymx@users.noreply.github.com> Date: Mon, 15 Jul 2019 10:56:15 -0400 Subject: [PATCH 03/56] Update README.md --- agents_using_gym/gymMountainCarv0/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents_using_gym/gymMountainCarv0/README.md b/agents_using_gym/gymMountainCarv0/README.md index a36f90fd1..8a2475783 100644 --- a/agents_using_gym/gymMountainCarv0/README.md +++ b/agents_using_gym/gymMountainCarv0/README.md @@ -7,6 +7,6 @@ pip install gym ``` ## Python files ### These files are just using gym, and can be run by ```python filename.py``` (or ```python3 filename.py``` if you are using linux.) IDEs shold be able to run them as well. -### cheating.py is an straight solution by Mark Yu after 2 second of thinking about this game, it represents Mark's superiority against AI. JK. -### simpleqlearning.py is an implementation of qlearning that Mark learnt from wikipedia [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning). Feel free to mess with the learning rate and discountrate and compare the time it takes for the car to reach the summit. +### cheating.py is a straight solution by Mark Yu after 2 seconds of thinking, it represents Mark's superiority against AI. JK. +### simpleqlearning.py is an implementation of qlearning, an algorithm that Mark learnt from wikipedia [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning). Feel free to mess with the learning rate and discountrate in the code and compare the time it takes for the AI to learn how to push the car to the summit. From ba131a8888cdc61762f6f6123bb5650498751b5f Mon Sep 17 00:00:00 2001 From: Devak Patel Date: Wed, 17 Jul 2019 10:10:18 -0400 Subject: [PATCH 04/56] Create EnvironmentIdeas.md --- EnvironmentIdeas.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 EnvironmentIdeas.md diff --git a/EnvironmentIdeas.md b/EnvironmentIdeas.md new file mode 100644 index 000000000..c91c25734 --- /dev/null +++ b/EnvironmentIdeas.md @@ -0,0 +1 @@ +#Idea 1: From 367a1c60c233cb6984b5c72e63bec92593d396d4 Mon Sep 17 00:00:00 2001 From: Devak Patel Date: Wed, 17 Jul 2019 10:14:29 -0400 Subject: [PATCH 05/56] Update and rename EnvironmentIdeas.md to ScenarioIdeas.md --- EnvironmentIdeas.md | 1 - ScenarioIdeas.md | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) delete mode 100644 EnvironmentIdeas.md create mode 100644 ScenarioIdeas.md diff --git a/EnvironmentIdeas.md b/EnvironmentIdeas.md deleted file mode 100644 index c91c25734..000000000 --- a/EnvironmentIdeas.md +++ /dev/null @@ -1 +0,0 @@ -#Idea 1: diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md new file mode 100644 index 000000000..e32eb935e --- /dev/null +++ b/ScenarioIdeas.md @@ -0,0 +1,11 @@ +# Idea table: + +Generated at: https://www.tablesgenerator.com/markdown_tables# \ + +| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | +|---- |------------------ |--------------------- |------------------------------ |------------------ |------------- | +| #1 | | | | | | +| #2 | | | | | | +| #3 | | | | | | +| #4 | | | | | | +| #5 | | | | | | From 593f2cb285cde6bf5009853dd589d3e7543cc4a9 Mon Sep 17 00:00:00 2001 From: Devak Patel Date: Wed, 17 Jul 2019 10:44:07 -0400 Subject: [PATCH 06/56] Added Scenario Idea 1 --- ScenarioIdeas.md | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff 
--git a/ScenarioIdeas.md b/ScenarioIdeas.md index e32eb935e..b2fa57b3d 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -1,11 +1,24 @@ # Idea table: -Generated at: https://www.tablesgenerator.com/markdown_tables# \ - -| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | -|---- |------------------ |--------------------- |------------------------------ |------------------ |------------- | -| #1 | | | | | | -| #2 | | | | | | -| #3 | | | | | | -| #4 | | | | | | -| #5 | | | | | | +Generated at: https://www.tablesgenerator.com/markdown_tables + +| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | +|----|-----------------------|--------------------------------------------------------------------------------------------------------------------------|-------------------------------------|------------------|-------------| +| #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. Trading gives bonus resources for both agents | No other entities other than agents | Attack | | +| #2 | | | | | | +| #3 | | | | | | +| #4 | | | | | | +| #5 | | | | | | + +# Details: +## Idea 1. (Risk but on a grid) +Grid based cell game, each agent starts with 1 cell on some part of the grid. Agents use resources to expand, attack, or trade with neighboring cells. Every turn agents gain a set amount of resources based on area of agent's cells. For every neighboring cell, if it is not occupied, the agent can choose to spend resources to expand into the area, or not. If the cell is occupied, the agent can choose to attack, or trade. Attacking allows for the takeover of the cell and requires the agent to spend resources. Trading requires the agent to give resources to the other agent, but if both agents decide to trade, they can recieve some bonus based on who gave more resources. If one agent attacks, and the other trades, the attacker automatically wins. If both attack, the agent that spent more resources to attack wins. Resource costs and bonuses can be tweaked to ensure fairness and balance. +### Examples: +Agent A and Agent B are neighbors: if A trades 2 resources, and B trades 4 resources, A could gain 4(from B) + 2(bonus includes how much given) + 1(some multiplier of how much was given in this case 0.5 for giving less) resulting in net +5, B would gain 2(from A) + 4(given) + 4(multiplier bonus of 1 for giving more) resulting in +6 \ +If A attacks B, spending 5 resources; B attempts to trade 4 resources, A takes over some area of B and gains 4 resources from B's trade with net gain of -1 resource and + some area; B has a net gain of -4 resources and -some area. \ +If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes some area of A. A has a net gain of -5 resources and -some area; B has a net gain of -5 resources and -some area. + +### Possible expansion: +Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. 
+ + From c6ac044e4ab3d550c4a73e0ba9c3de2e701c2c20 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 17 Jul 2019 13:10:19 -0400 Subject: [PATCH 07/56] start finding incompatibilities for latest gym version --- .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/multiagent-particle-envs.iml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ changes.txt | 9 +++++++++ 5 files changed, 35 insertions(+) create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/multiagent-particle-envs.iml create mode 100644 .idea/vcs.xml create mode 100644 changes.txt diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 000000000..65531ca99 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 000000000..20414b498 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/multiagent-particle-envs.iml b/.idea/multiagent-particle-envs.iml new file mode 100644 index 000000000..d0876a78d --- /dev/null +++ b/.idea/multiagent-particle-envs.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..94a25f7f4 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/changes.txt b/changes.txt new file mode 100644 index 000000000..560f4a9cd --- /dev/null +++ b/changes.txt @@ -0,0 +1,9 @@ +multi_discrete.py: +- Changed random_array assignment in sample() to use gym.utils.seeding +rendering.py: +- The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised + exception, so should be simple to reimplement (or ignore) +environment.py: +- line 234: geom.set_color(*entity.color, alpha=0.5) - receives multiple arguments. This is a pain, because it is + expanding the color argument, a 3-tuple or 4-tuple, but we want to set alpha to 0.5. A dumb fix is to make a new + tuple with the first three arguments of the color, and 0.5 for alpha. 
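On the reraise note in changes.txt above: since the helper only adds context to an exception that is already being raised, one option is to reimplement it locally rather than ignore it. The sketch below is ours, not gym's API (the later patches simply inline the hint message instead):

```python
import sys

def reraise(prefix="", suffix=""):
    """Re-raise the current exception with extra context added to its message."""
    exc_type, exc_value, _ = sys.exc_info()
    raise exc_type(f"{prefix}{exc_value} {suffix}") from exc_value

# usage, mirroring the original rendering.py pattern:
try:
    import pyglet
except ImportError:
    reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'.")
```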
From 113c199da5ddee2c9287f9f88576c4494e3aff67 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 17 Jul 2019 13:17:44 -0400 Subject: [PATCH 08/56] whoops remove editor files --- .gitignore | 3 ++- .idea/misc.xml | 4 ---- .idea/modules.xml | 8 -------- .idea/multiagent-particle-envs.iml | 8 -------- .idea/vcs.xml | 6 ------ 5 files changed, 2 insertions(+), 27 deletions(-) delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/multiagent-particle-envs.iml delete mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index 4681f8b57..2a0bf53ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__/ *.egg-info/ -*.pyc \ No newline at end of file +*.pyc +.idea/ diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 65531ca99..000000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 20414b498..000000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/multiagent-particle-envs.iml b/.idea/multiagent-particle-envs.iml deleted file mode 100644 index d0876a78d..000000000 --- a/.idea/multiagent-particle-envs.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7f4..000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 38490a2407f8d35f7d0a1aae20559be8fc9608a5 Mon Sep 17 00:00:00 2001 From: ymkymkymkymx <1435664939@qq.com> Date: Wed, 17 Jul 2019 13:24:03 -0400 Subject: [PATCH 09/56] downgrade --- agents_using_gym/gymMountainCarv0/simpleqlearning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents_using_gym/gymMountainCarv0/simpleqlearning.py b/agents_using_gym/gymMountainCarv0/simpleqlearning.py index f955d6e5c..3e8ccd7d1 100644 --- a/agents_using_gym/gymMountainCarv0/simpleqlearning.py +++ b/agents_using_gym/gymMountainCarv0/simpleqlearning.py @@ -48,7 +48,7 @@ def getstate(state): # Simulation ended (for any reson) - if goal position is achived - update Q value with reward directly - elif new_state[0] >= env.goal_position: + elif new_state[0] >= 0.5: print("We make it!") print(episode) q_table[currentstate + (action,)] = 0 From 2ef3400085abd67d4d410213be88ce329a4e5629 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Wed, 17 Jul 2019 13:33:22 -0400 Subject: [PATCH 10/56] Added testing.py to play around with a scenario --- multiagent/scenarios/testing.py | 139 ++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 multiagent/scenarios/testing.py diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py new file mode 100644 index 000000000..9bbf95de4 --- /dev/null +++ b/multiagent/scenarios/testing.py @@ -0,0 +1,139 @@ +import numpy as np +from multiagent.core import World, Agent, Landmark +from multiagent.scenario import BaseScenario + + +class Scenario(BaseScenario): + + def make_world(self): + world = World() + # set any world properties first + world.dim_c = 2 + num_agents = 5 + world.num_agents = num_agents + num_adversaries = 0 + num_landmarks = num_agents - 1 + # add agents + world.agents = [Agent() for i in range(num_agents)] + for i, agent in enumerate(world.agents): + agent.name = 'agent %d' % i + agent.collide = False + agent.silent = True + agent.adversary = True if i < num_adversaries else False + 
agent.size = 0.15 + # add landmarks + world.landmarks = [Landmark() for i in range(num_landmarks)] + for i, landmark in enumerate(world.landmarks): + landmark.name = 'landmark %d' % i + landmark.collide = False + landmark.movable = False + landmark.size = 0.08 + # make initial conditions + self.reset_world(world) + return world + + def reset_world(self, world): + # random properties for agents + world.agents[0].color = np.array([0.85, 0.35, 0.35]) + for i in range(1, world.num_agents): + world.agents[i].color = np.array([0.35, 0.35, 0.85]) + # random properties for landmarks + for i, landmark in enumerate(world.landmarks): + landmark.color = np.array([0.15, 0.15, 0.15]) + # set goal landmark + goal = np.random.choice(world.landmarks) + goal.color = np.array([0.15, 0.65, 0.15]) + for agent in world.agents: + agent.goal_a = goal + # set random initial states + for agent in world.agents: + agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + agent.state.p_vel = np.zeros(world.dim_p) + agent.state.c = np.zeros(world.dim_c) + for i, landmark in enumerate(world.landmarks): + landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + landmark.state.p_vel = np.zeros(world.dim_p) + + def benchmark_data(self, agent, world): + # returns data for benchmarking purposes + if agent.adversary: + return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) + else: + dists = [] + for l in world.landmarks: + dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) + dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) + return tuple(dists) + + # return all agents that are not adversaries + def good_agents(self, world): + return [agent for agent in world.agents if not agent.adversary] + + # return all adversarial agents + def adversaries(self, world): + return [agent for agent in world.agents if agent.adversary] + + def reward(self, agent, world): + # Agents are rewarded based on minimum agent distance to each landmark + return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) + + def agent_reward(self, agent, world): + # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it + shaped_reward = True + shaped_adv_reward = True + + # Calculate negative reward for adversary + adversary_agents = self.adversaries(world) + if shaped_adv_reward: # distance-based adversary reward + adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) + else: # proximity-based adversary reward (binary) + adv_rew = 0 + for a in adversary_agents: + if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: + adv_rew -= 5 + + # Calculate positive reward for agents + good_agents = self.good_agents(world) + if shaped_reward: # distance-based agent reward + pos_rew = -min( + [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + else: # proximity-based agent reward (binary) + pos_rew = 0 + if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ + < 2 * agent.goal_a.size: + pos_rew += 5 + pos_rew -= min( + [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + return pos_rew + adv_rew + + def adversary_reward(self, agent, world): + # Rewarded based on proximity to the goal landmark + shaped_reward = True + if shaped_reward: # distance-based reward + return -np.sum(np.square(agent.state.p_pos - 
agent.goal_a.state.p_pos)) + else: # proximity-based reward (binary) + adv_rew = 0 + if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: + adv_rew += 5 + return adv_rew + + + def observation(self, agent, world): + # get positions of all entities in this agent's reference frame + entity_pos = [] + for entity in world.landmarks: + entity_pos.append(entity.state.p_pos - agent.state.p_pos) + # entity colors + entity_color = [] + for entity in world.landmarks: + entity_color.append(entity.color) + # communication of all other agents + other_pos = [] + for other in world.agents: + if other is agent: continue + other_pos.append(other.state.p_pos - agent.state.p_pos) + + if not agent.adversary: + return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) + else: + return np.concatenate(entity_pos + other_pos) From b68236cf1a37f232e988c493a5126f20a3a5d7b8 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 17 Jul 2019 13:39:52 -0400 Subject: [PATCH 11/56] weird simple_crypto dimension mismatch --- changes.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/changes.txt b/changes.txt index 560f4a9cd..af7cbe1b5 100644 --- a/changes.txt +++ b/changes.txt @@ -4,6 +4,11 @@ rendering.py: - The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised exception, so should be simple to reimplement (or ignore) environment.py: -- line 234: geom.set_color(*entity.color, alpha=0.5) - receives multiple arguments. This is a pain, because it is +- line 234: `geom.set_color(*entity.color, alpha=0.5)` - receives multiple arguments. This is a pain, because it is expanding the color argument, a 3-tuple or 4-tuple, but we want to set alpha to 0.5. A dumb fix is to make a new tuple with the first three arguments of the color, and 0.5 for alpha. + +simple_crypto.py: +- line 121: array in conditional can potentially be a boolean rather than an ndarray so it will have no .all() method +- line 122: the sizes in the expression `agent.state.c - agent.goal_a.color` are mismatched and it is unclear where they + come from \ No newline at end of file From d7e489a62a672dd4deae0a87ca5b36289e09547c Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 17 Jul 2019 13:43:21 -0400 Subject: [PATCH 12/56] switch from prng to gym.utils.seeding --- changes.txt | 4 +++- multiagent/multi_discrete.py | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/changes.txt b/changes.txt index af7cbe1b5..04dc7d817 100644 --- a/changes.txt +++ b/changes.txt @@ -1,8 +1,10 @@ multi_discrete.py: -- Changed random_array assignment in sample() to use gym.utils.seeding +- [FIXED] Changed random_array assignment in sample() to use gym.utils.seeding + rendering.py: - The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised exception, so should be simple to reimplement (or ignore) + environment.py: - line 234: `geom.set_color(*entity.color, alpha=0.5)` - receives multiple arguments. This is a pain, because it is expanding the color argument, a 3-tuple or 4-tuple, but we want to set alpha to 0.5. 
A dumb fix is to make a new diff --git a/multiagent/multi_discrete.py b/multiagent/multi_discrete.py index d7108ad43..041484729 100644 --- a/multiagent/multi_discrete.py +++ b/multiagent/multi_discrete.py @@ -4,7 +4,8 @@ import numpy as np import gym -from gym.spaces import prng +from gym.utils import seeding + class MultiDiscrete(gym.Space): """ @@ -27,10 +28,12 @@ def __init__(self, array_of_param_array): self.high = np.array([x[1] for x in array_of_param_array]) self.num_discrete_space = self.low.shape[0] + self.random = seeding.np_random() + def sample(self): """ Returns a array with one sample from each discrete action space """ # For each row: round(random .* (max - min) + min, 0) - random_array = prng.np_random.rand(self.num_discrete_space) + random_array = self.random.rand(self.num_discrete_space) return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] def contains(self, x): return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() From 2db0588d0d182ca8702a83672c117692cfea6201 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 17 Jul 2019 13:47:31 -0400 Subject: [PATCH 13/56] fix environment argument splatting error with color --- changes.txt | 2 +- multiagent/environment.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/changes.txt b/changes.txt index 04dc7d817..d4eea280c 100644 --- a/changes.txt +++ b/changes.txt @@ -6,7 +6,7 @@ rendering.py: exception, so should be simple to reimplement (or ignore) environment.py: -- line 234: `geom.set_color(*entity.color, alpha=0.5)` - receives multiple arguments. This is a pain, because it is +- [FIXED] line 234: `geom.set_color(*entity.color, alpha=0.5)` - receives multiple arguments. This is a pain, because it is expanding the color argument, a 3-tuple or 4-tuple, but we want to set alpha to 0.5. A dumb fix is to make a new tuple with the first three arguments of the color, and 0.5 for alpha. diff --git a/multiagent/environment.py b/multiagent/environment.py index d2e8d3278..69efccde1 100644 --- a/multiagent/environment.py +++ b/multiagent/environment.py @@ -231,7 +231,8 @@ def render(self, mode='human'): geom = rendering.make_circle(entity.size) xform = rendering.Transform() if 'agent' in entity.name: - geom.set_color(*entity.color, alpha=0.5) + color = (entity.color[0], entity.color[1], entity.color[2], 0.5) + geom.set_color(*color) else: geom.set_color(*entity.color) geom.add_attr(xform) From c7dfadf866da6277c35fbdcfcfce6fa7829f7ec9 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Wed, 17 Jul 2019 13:54:02 -0400 Subject: [PATCH 14/56] Fixed reraise error --- multiagent/rendering.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/multiagent/rendering.py b/multiagent/rendering.py index cd00c7fb8..3962d8a04 100644 --- a/multiagent/rendering.py +++ b/multiagent/rendering.py @@ -17,13 +17,24 @@ try: import pyglet except ImportError as e: - reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") + #reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") + raise ImportError(''' + Cannot import pyglet. 
+ HINT: you can install pyglet directly via 'pip install pyglet'. + But if you really just want to install all Gym dependencies and not have to think about it, + 'pip install -e .[all]' or 'pip install gym[all]' will do it. + ''') try: from pyglet.gl import * except ImportError as e: - reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") - + #reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") + raise ImportError(''' + Error occured while running `from pyglet.gl import *` + HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. + If you're running on a server, you may need a virtual frame buffer; something like this should work: + 'xvfb-run -s \"-screen 0 1400x900x24\" python ' + ''') import math import numpy as np From 0732a28ec4a3917fd1d84ed9a1bfa73c2fd6e2f8 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Wed, 17 Jul 2019 13:54:33 -0400 Subject: [PATCH 15/56] Update changes.txt --- changes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes.txt b/changes.txt index d4eea280c..5e1d094fa 100644 --- a/changes.txt +++ b/changes.txt @@ -2,7 +2,7 @@ multi_discrete.py: - [FIXED] Changed random_array assignment in sample() to use gym.utils.seeding rendering.py: -- The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised +- [FIXED] The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised exception, so should be simple to reimplement (or ignore) environment.py: From dddd55989b4a724537e95017707e12c74a878e8d Mon Sep 17 00:00:00 2001 From: jarbus Date: Wed, 17 Jul 2019 15:15:28 -0400 Subject: [PATCH 16/56] adding documentation --- documentation.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 documentation.md diff --git a/documentation.md b/documentation.md new file mode 100644 index 000000000..b4db86409 --- /dev/null +++ b/documentation.md @@ -0,0 +1,48 @@ +# Environment + +- `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. + +- `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) + +# Policy + +A policy seems to be a system to control an agent. The interactive policy allows control of an agent with keyboard and mouse, but if we wish to implement algorithms we will most likely be implementing them as a policy. + +- `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. + + +# Scenarios + +- `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. + +- `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. 
scenario code consists of several functions: + 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). + called once at the beginning of each training session + 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world + called before every episode (including after make_world() before the first episode) + 3) `reward()`: defines the reward function for a given agent + 4) `observation()`: defines the observation space of a given agent + 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) + +You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). + +# Miscellaneous + +- `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. + +- `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. + +# Execution: + +1. bin/script.py loads - acts as main script +2. Loads scenario + - Uses scenario to generate world +3. Loads mutli-agent enviroment given scenario settings and world +4. Renders environment (initial render) +5. Assigns policies (algorithms) for each agent + - stored as policies[] list +6. Resets environment +7. Infinite while loop + 1. Makes a list of actions, one action per policy + 2. Performs one environment step using entire action list + 3. Re-render From eb09d38689cb0dd360ba7330183a05a2a41b4181 Mon Sep 17 00:00:00 2001 From: jarbus Date: Wed, 17 Jul 2019 19:54:27 -0400 Subject: [PATCH 17/56] More documentation --- documentation.md | 68 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/documentation.md b/documentation.md index b4db86409..4cb09cec2 100644 --- a/documentation.md +++ b/documentation.md @@ -1,17 +1,55 @@ -# Environment +# Execution: + +In a simulation with `n` agents: + +1. bin/script.py loads - acts as main script +2. Loads scenario + - `./multiagent/scenarios/scenario.py.make_world()` +3. Loads multi-agent enviroment given scenario settings and world + - `./multiagent/environment.py.MultiAgentEnv(Scenario.world())` +4. Renders environment (initial render) + - `./multiagent/environment.py.render()` +5. Assigns policies (algorithms) for each agent + - stored as policies[] list + - policy[agent_index] = ./multiagent/policies/template.py.TemplatePolicy(env,agent_index) + - Note: Template not implemented yet, see `./multiagent/policy.py.InteractivePolicy()` for now + - For more information, see [Policies](#POLICIES) +6. Resets environment +7. Infinite while loop + 1. Makes a list of actions, one action per policy + - actions[i] + 2. Performs one environment step using entire action list + - `multiagent/environment.py.step()` returns: + - n observations + - n rewards + - n done states + - n debug objects + 3. Re-render + - `multiagent/environment.py.render()` + +## Environment + +The main class in use during execution. The environment interacts with the scenario and the agents. There is one environment that all scenarios use. Each scenario implements reward() and observation() which the environment calls. -- `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. 
+- `./make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) -# Policy +## Policy -A policy seems to be a system to control an agent. The interactive policy allows control of an agent with keyboard and mouse, but if we wish to implement algorithms we will most likely be implementing them as a policy. +A policy seems to be a system to control an agent. The interactive policy allows control of an agent with keyboard and mouse, but if we wish to implement algorithms we will most likely be implementing them as a policy. **NOTE: Policies are enumerable** - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. +A Policy has two functions: -# Scenarios +- `__init__()` passes the environment to the policy class +- `action(obs)` performs an action given an observation + + +## Scenarios + +A BaseScenario `multiagent/scenario.py` incorporates at least `make_world()` and `reset_world()`. An implemented Scenario will incorporate reward() and observation(). All scenario calls are made through the environment. - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. @@ -20,29 +58,15 @@ A policy seems to be a system to control an agent. The interactive policy allows called once at the beginning of each training session 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world called before every episode (including after make_world() before the first episode) - 3) `reward()`: defines the reward function for a given agent - 4) `observation()`: defines the observation space of a given agent + 3) `reward(agent,world)`: defines the reward function for a given agent + 4) `observation(agent, world)`: defines the observation space of a given agent 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). -# Miscellaneous +## Miscellaneous - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. -# Execution: - -1. bin/script.py loads - acts as main script -2. Loads scenario - - Uses scenario to generate world -3. Loads mutli-agent enviroment given scenario settings and world -4. Renders environment (initial render) -5. Assigns policies (algorithms) for each agent - - stored as policies[] list -6. Resets environment -7. Infinite while loop - 1. Makes a list of actions, one action per policy - 2. Performs one environment step using entire action list - 3. 
Re-render From 990ca856be8873d6aef025e5713e4aa64b011612 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Fri, 19 Jul 2019 21:09:24 -0400 Subject: [PATCH 18/56] Added some comments to testing.py to better understand --- multiagent/scenarios/testing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 9bbf95de4..9a46a9817 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -6,7 +6,7 @@ class Scenario(BaseScenario): def make_world(self): - world = World() + world = World() #World has agents and landmarks # set any world properties first world.dim_c = 2 num_agents = 5 @@ -104,8 +104,9 @@ def agent_reward(self, agent, world): pos_rew += 5 pos_rew -= min( [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) - return pos_rew + adv_rew + return pos_rew + adv_rew #Rewards are a simple int + #Adversaries are given rewards def adversary_reward(self, agent, world): # Rewarded based on proximity to the goal landmark shaped_reward = True @@ -118,6 +119,7 @@ def adversary_reward(self, agent, world): return adv_rew + #What is passed to the agent ie How they see the world def observation(self, agent, world): # get positions of all entities in this agent's reference frame entity_pos = [] From 8a6cad26adef142c37ae83075faf1d0ae1325f38 Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 13:20:05 -0400 Subject: [PATCH 19/56] Path for "scenario.py" in the documentation --- documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation.md b/documentation.md index 4cb09cec2..a72097df1 100644 --- a/documentation.md +++ b/documentation.md @@ -4,7 +4,7 @@ In a simulation with `n` agents: 1. bin/script.py loads - acts as main script 2. Loads scenario - - `./multiagent/scenarios/scenario.py.make_world()` + - `./multiagent/scenario.py.make_world()` 3. Loads multi-agent enviroment given scenario settings and world - `./multiagent/environment.py.MultiAgentEnv(Scenario.world())` 4. 
Renders environment (initial render) From fcbd86ba8989cd736fb3c34c2d535ce92e6e9691 Mon Sep 17 00:00:00 2001 From: Brin775 <43180128+Brin775@users.noreply.github.com> Date: Sat, 20 Jul 2019 15:11:59 -0400 Subject: [PATCH 20/56] More info on simple_crypto.py --- changes.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/changes.txt b/changes.txt index 5e1d094fa..95529c034 100644 --- a/changes.txt +++ b/changes.txt @@ -12,5 +12,12 @@ environment.py: simple_crypto.py: - line 121: array in conditional can potentially be a boolean rather than an ndarray so it will have no .all() method + - Similar statements appear on lines 104, 109 - line 122: the sizes in the expression `agent.state.c - agent.goal_a.color` are mismatched and it is unclear where they - come from \ No newline at end of file + come from + - Similar statements appear on lines 107, 112 +- [Austen] I got it to run by removing the .all() calls and changing lines like + np.sum(np.square(agent.state.c - agent.goal_a.color)) + to + np.square(len(agent.state.c)-len(agent.goal_a.color)) + Not sure if scenario still functions correctly / if reward calculations are accurate From b7ceac72940d001fb0f1546d324412d303a5fe27 Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 15:36:48 -0400 Subject: [PATCH 21/56] More on multiagent/core.py --- documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation.md b/documentation.md index a72097df1..11ed36a06 100644 --- a/documentation.md +++ b/documentation.md @@ -66,7 +66,7 @@ You can create new scenarios by implementing the first 4 functions above (`make_ ## Miscellaneous -- `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. +- `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code.(used for creating a scenario. We might need customized entities, agents for our own scenarios.) - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. From 4314fb66f15b9aa10e61161415fa5e6bf766ca0d Mon Sep 17 00:00:00 2001 From: jarbus Date: Sat, 20 Jul 2019 16:06:28 -0400 Subject: [PATCH 22/56] compatible? 
--- multiagent/rendering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multiagent/rendering.py b/multiagent/rendering.py index 3962d8a04..d72f1b98e 100644 --- a/multiagent/rendering.py +++ b/multiagent/rendering.py @@ -11,7 +11,7 @@ os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite -from gym.utils import reraise +#from gym.utils import reraise from gym import error try: @@ -353,4 +353,4 @@ def close(self): self.window.close() self.isopen = False def __del__(self): - self.close() \ No newline at end of file + self.close() From 28de1ad212865d5546af8952aab1ab891634353f Mon Sep 17 00:00:00 2001 From: Devak Patel Date: Sat, 20 Jul 2019 16:22:41 -0400 Subject: [PATCH 23/56] Added race scenario --- ScenarioIdeas.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index b2fa57b3d..ca603279a 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -3,9 +3,9 @@ Generated at: https://www.tablesgenerator.com/markdown_tables | | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | -|----|-----------------------|--------------------------------------------------------------------------------------------------------------------------|-------------------------------------|------------------|-------------| +|----|-----------------------|--------------------------------------------------------------------------------------------------------------------------|-------------------------------------|-------------------------|-------------| | #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. Trading gives bonus resources for both agents | No other entities other than agents | Attack | | -| #2 | | | | | | +| #2 | Move x steps | Agents try to close distance to flag | No other entities other than agents | Move as far as possible | | | #3 | | | | | | | #4 | | | | | | | #5 | | | | | | @@ -21,4 +21,5 @@ If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes ### Possible expansion: Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. - +## Idea 2. Race +2D plane where agents try to race to their landmark. Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. Agents are rewarded for reaching the landmark. From dd886dd9ac7698ea933f1636ba944cd8905fce62 Mon Sep 17 00:00:00 2001 From: Devak Patel Date: Sat, 20 Jul 2019 16:23:31 -0400 Subject: [PATCH 24/56] Update ScenarioIdeas.md --- ScenarioIdeas.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index ca603279a..795d9709b 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -22,4 +22,4 @@ If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. ## Idea 2. Race -2D plane where agents try to race to their landmark. Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. Agents are rewarded for reaching the landmark. +2D plane where agents try to race to their landmark. 
Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. Agents are rewarded for reaching the landmark. Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. From 77e94869237f1ad08be6614e0265af4860f04209 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Sat, 20 Jul 2019 16:38:24 -0400 Subject: [PATCH 25/56] Update testing.py --- multiagent/scenarios/testing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 9a46a9817..70c448c14 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -8,11 +8,11 @@ class Scenario(BaseScenario): def make_world(self): world = World() #World has agents and landmarks # set any world properties first - world.dim_c = 2 - num_agents = 5 + world.dim_c = 0 + num_agents = 2 #Change this to add agents world.num_agents = num_agents num_adversaries = 0 - num_landmarks = num_agents - 1 + num_landmarks = num_agents # add agents world.agents = [Agent() for i in range(num_agents)] for i, agent in enumerate(world.agents): @@ -45,14 +45,14 @@ def reset_world(self, world): goal.color = np.array([0.15, 0.65, 0.15]) for agent in world.agents: agent.goal_a = goal - # set random initial states + # set random initial states TODO: Initialize agents + landmarks to set positions with 0 velocity for agent in world.agents: agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) - agent.state.p_vel = np.zeros(world.dim_p) + agent.state.p_vel = 0 agent.state.c = np.zeros(world.dim_c) for i, landmark in enumerate(world.landmarks): landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) - landmark.state.p_vel = np.zeros(world.dim_p) + landmark.state.p_vel = 0 def benchmark_data(self, agent, world): # returns data for benchmarking purposes @@ -77,7 +77,7 @@ def reward(self, agent, world): # Agents are rewarded based on minimum agent distance to each landmark return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) - def agent_reward(self, agent, world): + def agent_reward(self, agent, world): #TODO: set reward to distance to goal landmark, remove adversary stuff # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it shaped_reward = True shaped_adv_reward = True @@ -129,7 +129,7 @@ def observation(self, agent, world): entity_color = [] for entity in world.landmarks: entity_color.append(entity.color) - # communication of all other agents + # communication of all other Agents TODO: remove communication other_pos = [] for other in world.agents: if other is agent: continue From f643ac2a893670a1ae5dc4129176dd51ec760312 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Sat, 20 Jul 2019 16:55:53 -0400 Subject: [PATCH 26/56] Update testing.py --- multiagent/scenarios/testing.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 70c448c14..3ed98c815 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -40,18 +40,15 @@ def reset_world(self, world): # random properties for landmarks for i, landmark in enumerate(world.landmarks): landmark.color = np.array([0.15, 0.15, 0.15]) - # set goal landmark - goal = 
np.random.choice(world.landmarks) - goal.color = np.array([0.15, 0.65, 0.15]) for agent in world.agents: agent.goal_a = goal # set random initial states TODO: Initialize agents + landmarks to set positions with 0 velocity - for agent in world.agents: - agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + for i, agent in enumerate(world.agents): + agent.state.p_pos = np.array([i/2,0]) agent.state.p_vel = 0 agent.state.c = np.zeros(world.dim_c) for i, landmark in enumerate(world.landmarks): - landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) + landmark.state.p_pos = np.array([i,5]) landmark.state.p_vel = 0 def benchmark_data(self, agent, world): @@ -69,15 +66,16 @@ def benchmark_data(self, agent, world): def good_agents(self, world): return [agent for agent in world.agents if not agent.adversary] - # return all adversarial agents - def adversaries(self, world): - return [agent for agent in world.agents if agent.adversary] + # # return all adversarial agents + # def adversaries(self, world): + # return [agent for agent in world.agents if agent.adversary] def reward(self, agent, world): + return self.agent_reward(agent,world) # Agents are rewarded based on minimum agent distance to each landmark - return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) + # return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) - def agent_reward(self, agent, world): #TODO: set reward to distance to goal landmark, remove adversary stuff + def agent_reward(self, agent, world): #TODO: set reward to distance to their landmark, remove adversary stuff # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it shaped_reward = True shaped_adv_reward = True From bd12a438ae017ce17d87cf080a952b441e24e77e Mon Sep 17 00:00:00 2001 From: linlinbest <444053358@qq.com> Date: Sat, 20 Jul 2019 17:23:23 -0400 Subject: [PATCH 27/56] Modified policy.py so that agents can go to the landmark automatically. No learning algorithms implemented yet. 
--- multiagent/policy.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/multiagent/policy.py b/multiagent/policy.py index cf9ad0e1b..8d70f9aef 100644 --- a/multiagent/policy.py +++ b/multiagent/policy.py @@ -1,6 +1,8 @@ import numpy as np from pyglet.window import key +from multiagent.scenarios.simple import Scenario + # individual agent policy class Policy(object): def __init__(self): @@ -14,6 +16,7 @@ class InteractivePolicy(Policy): def __init__(self, env, agent_index): super(InteractivePolicy, self).__init__() self.env = env + #self.agent_index = agent_index # hard-coded keyboard events self.move = [False for i in range(4)] self.comm = [False for i in range(env.world.dim_c)] @@ -23,6 +26,28 @@ def __init__(self, env, agent_index): def action(self, obs): # ignore observation and just act based on keyboard events + + + #x_axis = self.env.agents[self.agent_index].state.p_pos[0] + #y_axis = self.env.agents[self.agent_index].state.p_pos[1] + + if obs[2] < 0: + self.move[1] = True + elif obs[2] > 0: + self.move[0] = True + else: + self.move[0] = False + self.move[1] = False + + if obs[3] > 0: + self.move[3] = True + elif obs[3] < 0: + self.move[2] = True + else: + self.move[2] = False + self.move[3] = False + + if self.env.discrete_action_input: u = 0 if self.move[0]: u = 1 @@ -31,12 +56,12 @@ def action(self, obs): if self.move[3]: u = 3 else: u = np.zeros(5) # 5-d because of no-move action - if self.move[0]: u[1] += 1.0 - if self.move[1]: u[2] += 1.0 - if self.move[3]: u[3] += 1.0 - if self.move[2]: u[4] += 1.0 + if self.move[0]: u[1] += 0.01 + if self.move[1]: u[2] += 0.01 + if self.move[3]: u[3] += 0.01 + if self.move[2]: u[4] += 0.01 if True not in self.move: - u[0] += 1.0 + u[0] += 0.01 return np.concatenate([u, np.zeros(self.env.world.dim_c)]) # keyboard event callbacks From f26f54c84adda56c96be5b162c9a7abe36a6b08f Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 17:37:02 -0400 Subject: [PATCH 28/56] more details --- documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation.md b/documentation.md index 11ed36a06..c4721afe8 100644 --- a/documentation.md +++ b/documentation.md @@ -62,7 +62,7 @@ A BaseScenario `multiagent/scenario.py` incorporates at least `make_world()` and 4) `observation(agent, world)`: defines the observation space of a given agent 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) -You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). +You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`), and have to keep the same function signature(can't not change parameters). 
## Miscellaneous From ab27aaa82c3763b3d14242f4736ccd285991671c Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 17:39:33 -0400 Subject: [PATCH 29/56] more details --- documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation.md b/documentation.md index c4721afe8..4cdb2c007 100644 --- a/documentation.md +++ b/documentation.md @@ -62,7 +62,7 @@ A BaseScenario `multiagent/scenario.py` incorporates at least `make_world()` and 4) `observation(agent, world)`: defines the observation space of a given agent 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) -You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`), and have to keep the same function signature(can't not change parameters). +You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`), and have to keep the same function signature(can't not change parameters), unless we all make changes to multiagent/environment. ## Miscellaneous From b1928194851c965cc2615fcaaa644f7f92dce5e5 Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 17:41:59 -0400 Subject: [PATCH 30/56] added reward and observation function to BaseScenario --- multiagent/scenario.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/multiagent/scenario.py b/multiagent/scenario.py index 02d86773e..1718ea9bf 100644 --- a/multiagent/scenario.py +++ b/multiagent/scenario.py @@ -8,3 +8,7 @@ def make_world(self): # create initial conditions of the world def reset_world(self, world): raise NotImplementedError() + def reward(self, agent, world): + raise NotImplementedError() + def observation(self, agent, world): + raise NotImplementedError() From 51e2cb83b3c5c99d0313ebff6ea4b74d223e7b1c Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:09:43 -0400 Subject: [PATCH 31/56] a customized scenario --- multiagent/scenarios/cus.py | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 multiagent/scenarios/cus.py diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py new file mode 100644 index 000000000..7e169701a --- /dev/null +++ b/multiagent/scenarios/cus.py @@ -0,0 +1,53 @@ +import numpy as np +from multiagent.core import World, Agent, Landmark +from multiagent.scenario import BaseScenario + +class Scenario(BaseScenario): + def make_world(self): + world = World() + # add agents + world.agents = [Agent() for i in range(1)] + for i, agent in enumerate(world.agents): + agent.name = 'agent %d' % i + agent.collide = False + agent.silent = True + # add landmarks + world.landmarks = [Landmark() for i in range(1)] + for i, landmark in enumerate(world.landmarks): + landmark.name = 'landmark %d' % i + landmark.collide = False + landmark.movable = False + # make initial conditions + self.reset_world(world) + return world + + def reset_world(self, world): + # random properties for agents + for i, agent in enumerate(world.agents): + agent.color = np.array([0.25,0.25,0.25]) + # random properties for landmarks + for i, landmark in enumerate(world.landmarks): + landmark.color = np.array([0.75,0.75,0.75]) + world.landmarks[0].color = np.array([0.75,0.25,0.25]) + # set random initial states + for agent in world.agents: + agent.state.p_pos = 
np.random.uniform(-1,+1, world.dim_p) + agent.state.p_vel = np.zeros(world.dim_p) + agent.state.c = np.zeros(world.dim_c) + for i, landmark in enumerate(world.landmarks): + landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + landmark.state.p_vel = np.zeros(world.dim_p) + + def reward(self, agent, world): + # dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) + # dist2 = world.landmarks[0].state.p_pos + delta_pos = agent.state.p_pos - world.landmarks[0].state.p_pos + dist = np.sqrt(np.sum(np.square(delta_pos))) + return dist + + def observation(self, agent, world): + # get positions of all entities in this agent's reference frame + entity_pos = [] + for entity in world.landmarks: + entity_pos.append(entity.state.p_pos - agent.state.p_pos) + return np.concatenate([agent.state.p_vel] + entity_pos) From ee9d0687efac7632252f1f65da4413363117fa0c Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:20:08 -0400 Subject: [PATCH 32/56] adding dictionary --- multiagent/scenarios/cus.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py index 7e169701a..0b65e0342 100644 --- a/multiagent/scenarios/cus.py +++ b/multiagent/scenarios/cus.py @@ -3,6 +3,10 @@ from multiagent.scenario import BaseScenario class Scenario(BaseScenario): + def __init__(self): + super(Scenario, self).__init__() + self.agentsToLandMarks = None + def make_world(self): world = World() # add agents From e2f2fdeb19052ceb0c1a2a18b4f18b8e9e19d74f Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:34:54 -0400 Subject: [PATCH 33/56] remards based on distance between agent and its target --- multiagent/scenarios/cus.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py index 0b65e0342..4de51ce58 100644 --- a/multiagent/scenarios/cus.py +++ b/multiagent/scenarios/cus.py @@ -5,22 +5,27 @@ class Scenario(BaseScenario): def __init__(self): super(Scenario, self).__init__() - self.agentsToLandMarks = None - + self.agentsToLandMarks = {} + def make_world(self): world = World() # add agents - world.agents = [Agent() for i in range(1)] + numberOfAgents = 2; + world.agents = [Agent() for i in range(numberOfAgents)] for i, agent in enumerate(world.agents): agent.name = 'agent %d' % i agent.collide = False agent.silent = True # add landmarks - world.landmarks = [Landmark() for i in range(1)] + world.landmarks = [Landmark() for i in range(numberOfAgents)] for i, landmark in enumerate(world.landmarks): landmark.name = 'landmark %d' % i landmark.collide = False landmark.movable = False + #fill in the dictionary + for i in range(numberOfAgents): + self.agentsToLandMarks.update({ world.agents[i]: world.landmarks[i] }) + # make initial conditions self.reset_world(world) return world @@ -45,7 +50,7 @@ def reset_world(self, world): def reward(self, agent, world): # dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) # dist2 = world.landmarks[0].state.p_pos - delta_pos = agent.state.p_pos - world.landmarks[0].state.p_pos + delta_pos = agent.state.p_pos - self.agentsToLandMarks[agent].state.p_pos dist = np.sqrt(np.sum(np.square(delta_pos))) return dist From 54d39c6b8b3560a579960551b477e295c42c0b72 Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:38:22 -0400 Subject: [PATCH 34/56] minor changes --- multiagent/scenarios/cus.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py index 4de51ce58..8b53fff0d 100644 --- a/multiagent/scenarios/cus.py +++ b/multiagent/scenarios/cus.py @@ -10,7 +10,7 @@ def __init__(self): def make_world(self): world = World() # add agents - numberOfAgents = 2; + numberOfAgents = 1; world.agents = [Agent() for i in range(numberOfAgents)] for i, agent in enumerate(world.agents): agent.name = 'agent %d' % i From 883ccaf378d0b051962538ce6b0b8d345e13d5af Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:39:52 -0400 Subject: [PATCH 35/56] leave policy unchanged for now --- multiagent/policy.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/multiagent/policy.py b/multiagent/policy.py index 8d70f9aef..ff2e8997a 100644 --- a/multiagent/policy.py +++ b/multiagent/policy.py @@ -31,21 +31,21 @@ def action(self, obs): #x_axis = self.env.agents[self.agent_index].state.p_pos[0] #y_axis = self.env.agents[self.agent_index].state.p_pos[1] - if obs[2] < 0: - self.move[1] = True - elif obs[2] > 0: - self.move[0] = True - else: - self.move[0] = False - self.move[1] = False + # if obs[2] < 0: + # self.move[1] = True + # elif obs[2] > 0: + # self.move[0] = True + # else: + # self.move[0] = False + # self.move[1] = False - if obs[3] > 0: - self.move[3] = True - elif obs[3] < 0: - self.move[2] = True - else: - self.move[2] = False - self.move[3] = False + # if obs[3] > 0: + # self.move[3] = True + # elif obs[3] < 0: + # self.move[2] = True + # else: + # self.move[2] = False + # self.move[3] = False if self.env.discrete_action_input: From 4753e06700f42d5fb8406b90a00156c399734d41 Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:44:00 -0400 Subject: [PATCH 36/56] environment no longer printing message, leave printing in script --- multiagent/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiagent/environment.py b/multiagent/environment.py index 69efccde1..86df92de0 100644 --- a/multiagent/environment.py +++ b/multiagent/environment.py @@ -210,7 +210,7 @@ def render(self, mode='human'): else: word = alphabet[np.argmax(other.state.c)] message += (other.name + ' to ' + agent.name + ': ' + word + ' ') - print(message) + # print(message) for i in range(len(self.viewers)): # create viewers (if necessary) From b15177410a3d509017ba1d99ec57bca1136bf3ad Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:53:55 -0400 Subject: [PATCH 37/56] agent landmark position fixed --- multiagent/scenarios/cus.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py index 8b53fff0d..20c254caf 100644 --- a/multiagent/scenarios/cus.py +++ b/multiagent/scenarios/cus.py @@ -10,7 +10,7 @@ def __init__(self): def make_world(self): world = World() # add agents - numberOfAgents = 1; + numberOfAgents = 2; world.agents = [Agent() for i in range(numberOfAgents)] for i, agent in enumerate(world.agents): agent.name = 'agent %d' % i @@ -36,15 +36,17 @@ def reset_world(self, world): agent.color = np.array([0.25,0.25,0.25]) # random properties for landmarks for i, landmark in enumerate(world.landmarks): - landmark.color = np.array([0.75,0.75,0.75]) + landmark.color = np.array([0.75,0.25,0.25]) world.landmarks[0].color = np.array([0.75,0.25,0.25]) # set random initial states - for agent in world.agents: - agent.state.p_pos = 
np.random.uniform(-1,+1, world.dim_p) + for i,agent in enumerate(world.agents): + # agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + agent.state.p_pos = np.array([i/2,0]) agent.state.p_vel = np.zeros(world.dim_p) agent.state.c = np.zeros(world.dim_c) for i, landmark in enumerate(world.landmarks): - landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + # landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + landmark.state.p_pos = np.array([i/2,0.75]) landmark.state.p_vel = np.zeros(world.dim_p) def reward(self, agent, world): From 1ff2b05785138ad422dbf412c9ff50cd948847cd Mon Sep 17 00:00:00 2001 From: zrysnd <503591415@qq.com> Date: Sat, 20 Jul 2019 18:57:19 -0400 Subject: [PATCH 38/56] more reward closer --- multiagent/scenarios/cus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py index 20c254caf..c32da49e2 100644 --- a/multiagent/scenarios/cus.py +++ b/multiagent/scenarios/cus.py @@ -54,7 +54,7 @@ def reward(self, agent, world): # dist2 = world.landmarks[0].state.p_pos delta_pos = agent.state.p_pos - self.agentsToLandMarks[agent].state.p_pos dist = np.sqrt(np.sum(np.square(delta_pos))) - return dist + return -dist def observation(self, agent, world): # get positions of all entities in this agent's reference frame From 8fbb1bf421ad6875c497acccfe387ed41b397594 Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 19:04:22 -0400 Subject: [PATCH 39/56] documenting visualization --- documentation.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation.md b/documentation.md index 4cdb2c007..731e40296 100644 --- a/documentation.md +++ b/documentation.md @@ -70,3 +70,8 @@ You can create new scenarios by implementing the first 4 functions above (`make_ - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. +## Visualization: + +1.Each agent will have one corresponding window generated for it, agents always locate at the center of the camera in its own wondow. +2.In the interactive policy, pressing -> will make the agent go left in the world, but everything else goes right in its window(since it's also at the center of its own window). + From c358b4b40489e559f39e523be8a213fa09160582 Mon Sep 17 00:00:00 2001 From: zrysnd <43715612+zrysnd@users.noreply.github.com> Date: Sat, 20 Jul 2019 19:06:37 -0400 Subject: [PATCH 40/56] documenting visualization --- documentation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation.md b/documentation.md index 731e40296..f7e697ccf 100644 --- a/documentation.md +++ b/documentation.md @@ -72,6 +72,6 @@ You can create new scenarios by implementing the first 4 functions above (`make_ ## Visualization: -1.Each agent will have one corresponding window generated for it, agents always locate at the center of the camera in its own wondow. -2.In the interactive policy, pressing -> will make the agent go left in the world, but everything else goes right in its window(since it's also at the center of its own window). +1. Each agent will have one corresponding window generated for itself, agents always locate at the center of the camera in its own wondow. +2. In the interactive policy, pressing -> will make the agent go left in the world, but everything else goes right in its own window(since it's always at the center of its own window). 
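For reference, the per-agent windows described above come from giving the environment one viewer per agent instead of a shared one. The sketch below is based on the `interactive_tmp.py` script added later in this series; the `simple.py` scenario name is just the script's default, and reading `shared_viewer=False` as the thing that produces the separate agent-centered windows is an inference from that script rather than something stated in the docs.

```
from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

scenario = scenarios.load('simple.py').Scenario()
world = scenario.make_world()
# shared_viewer=False -> one window per agent, each centered on its own agent
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation,
                    info_callback=None, shared_viewer=False)
env.render()
# each InteractivePolicy registers its key handlers on that agent's window
policies = [InteractivePolicy(env, i) for i in range(env.n)]
```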
From 2c2d347f3cc618d4be161bef7b6b7178b23301ad Mon Sep 17 00:00:00 2001 From: Brin775 <43180128+Brin775@users.noreply.github.com> Date: Sun, 21 Jul 2019 13:02:52 -0400 Subject: [PATCH 41/56] Added race.py (not finished) --- multiagent/scenarios/race.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 multiagent/scenarios/race.py diff --git a/multiagent/scenarios/race.py b/multiagent/scenarios/race.py new file mode 100644 index 000000000..f4a21b57f --- /dev/null +++ b/multiagent/scenarios/race.py @@ -0,0 +1,59 @@ +import numpy as np +from multiagent.core import World, Agent, Landmark +from multiagent.scenario import BaseScenario + +class Scenario(BaseScenario): + def make_world(self): + world = World() + # add agents + world.agents = [Agent() for i in range(2)] + for i, agent in enumerate(world.agents): + agent.name = 'agent %d' % i + agent.collide = False + agent.silent = True + # add landmarks + world.landmarks = [Landmark() for i in range(2)] + for i, landmark in enumerate(world.landmarks): + landmark.name = 'landmark %d' % i + landmark.collide = False + landmark.movable = False + # make initial conditions + self.reset_world(world) + return world + + def reset_world(self, world): + # random properties for agents + for i, agent in enumerate(world.agents): + agent.color = np.array([0.25,0.25,0.25]) + # random properties for landmarks + for i, landmark in enumerate(world.landmarks): + landmark.color = np.array([0.75,0.75,0.75]) + world.landmarks[0].color = np.array([0.75,0.25,0.25]) + world.landmarks[1].color = np.array([0.75,0.25,0.25]) + # set random initial states + #for agent in world.agents: + #agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + world.agents[0].state.p_pos = np.array([0.0,0.0]) + world.agents[0].state.p_vel = np.zeros(world.dim_p) + world.agents[0].state.c = np.zeros(world.dim_c) + + world.agents[1].state.p_pos = np.array([0.5,0.0]) + world.agents[1].state.p_vel = np.zeros(world.dim_p) + world.agents[1].state.c = np.zeros(world.dim_c) + + for i, landmark in enumerate(world.landmarks): + #landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) + landmark.state.p_pos = np.array([0.0 + i*0.5, 5.0]) + landmark.state.p_vel = np.zeros(world.dim_p) + + def reward(self, agent, world): + #dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) + dist2 = np.sum(np.square(agent.state.p_pos - np.array([agent.state.p_pos[0], 5.0]))) + return -dist2 + + def observation(self, agent, world): + # get positions of all entities in this agent's reference frame + entity_pos = [] + for entity in world.landmarks: + entity_pos.append(entity.state.p_pos - agent.state.p_pos) + return np.concatenate([agent.state.p_vel] + entity_pos) From 927f504a13883123554f4e18abf398a3e9e98605 Mon Sep 17 00:00:00 2001 From: jarbus Date: Sun, 21 Jul 2019 13:22:24 -0400 Subject: [PATCH 42/56] Tweak scenarioideas.md --- ScenarioIdeas.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 795d9709b..0e9f6169d 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -5,7 +5,7 @@ Generated at: https://www.tablesgenerator.com/markdown_tables | | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | |----|-----------------------|--------------------------------------------------------------------------------------------------------------------------|-------------------------------------|-------------------------|-------------| | 
#1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. Trading gives bonus resources for both agents | No other entities other than agents | Attack | | -| #2 | Move x steps | Agents try to close distance to flag | No other entities other than agents | Move as far as possible | | +| #2 | Move x steps | Reward = progress in last step | No other entities other than agents | Move as far as possible | | | #3 | | | | | | | #4 | | | | | | | #5 | | | | | | @@ -22,4 +22,7 @@ If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. ## Idea 2. Race -2D plane where agents try to race to their landmark. Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. Agents are rewarded for reaching the landmark. Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. +2D plane where agents try to race to their landmark. Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. ~~Agents are rewarded for reaching the landmark~~ Agents are rewarded based off of how many steps they are able to take per turn. Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. + +- The landmarks don't actually get taken into account for the rewards or observation, it's simply a visualization of how much progess each agent is able to make. +- Agents will either have to be moved by the scenario via physics, or they can move based off of the reward recieved on their next action, the following turn. From 2e9d4ec759d533494fc8fc52046da7fea7df90e1 Mon Sep 17 00:00:00 2001 From: jarbus Date: Sun, 21 Jul 2019 14:26:11 -0400 Subject: [PATCH 43/56] race tweaks --- ScenarioIdeas.md | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 0e9f6169d..82e8e2e58 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -2,13 +2,14 @@ Generated at: https://www.tablesgenerator.com/markdown_tables -| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | -|----|-----------------------|--------------------------------------------------------------------------------------------------------------------------|-------------------------------------|-------------------------|-------------| -| #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. 
Trading gives bonus resources for both agents | No other entities other than agents | Attack | | -| #2 | Move x steps | Reward = progress in last step | No other entities other than agents | Move as far as possible | | -| #3 | | | | | | -| #4 | | | | | | -| #5 | | | | | | +| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | +| ---- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- | ------------------------- | ------------- | +| #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. | No other entities other than agents | Attack | | +| | | Trading gives bonus resources for both agents | | | | +| #2 | Move x steps | Reward = progress in last step | No other entities other than agents | Move as far as possible | | +| #3 | | | | | | +| #4 | | | | | | +| #5 | | | | | | # Details: ## Idea 1. (Risk but on a grid) @@ -22,7 +23,21 @@ If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. ## Idea 2. Race -2D plane where agents try to race to their landmark. Agents can take any x number of steps to advance to the landmark. If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. ~~Agents are rewarded for reaching the landmark~~ Agents are rewarded based off of how many steps they are able to take per turn. Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. -- The landmarks don't actually get taken into account for the rewards or observation, it's simply a visualization of how much progess each agent is able to make. +#### World +- 2D plane where agents try to race to their landmark. +- Agents only move forward/backward parallel to each other +- All agents start from the same location + +#### Rules +- Agents can take any x number of steps to advance to the landmark. +- If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. +- Agents are rewarded based off of how many steps they are successfully able to take per turn. + - Say each agent tries to take 10 steps, but because too many agents are trying to move in a turn, they all get moved back 5 steps. In this case, their reward would be -5, even though they tried to take 10 steps. + +#### Variables +- Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. + +#### Notes +- The landmarks don't actually get taken into account for the rewards or observation, it's simply aiding visualization of how much progess each agent is able to make. - Agents will either have to be moved by the scenario via physics, or they can move based off of the reward recieved on their next action, the following turn. 
From 2c178b702640e6eeefeb7d0b6fd77797fbadf627 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Mon, 22 Jul 2019 16:58:28 -0400 Subject: [PATCH 44/56] Commenting --- multiagent/scenarios/testing.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 3ed98c815..80324dc0a 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -71,6 +71,7 @@ def good_agents(self, world): # return [agent for agent in world.agents if agent.adversary] def reward(self, agent, world): + return np.sum(np.s) return self.agent_reward(agent,world) # Agents are rewarded based on minimum agent distance to each landmark # return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) @@ -80,15 +81,15 @@ def agent_reward(self, agent, world): #TODO: set reward to distance to their l shaped_reward = True shaped_adv_reward = True - # Calculate negative reward for adversary - adversary_agents = self.adversaries(world) - if shaped_adv_reward: # distance-based adversary reward - adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) - else: # proximity-based adversary reward (binary) - adv_rew = 0 - for a in adversary_agents: - if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: - adv_rew -= 5 + # # Calculate negative reward for adversary + # adversary_agents = self.adversaries(world) + # if shaped_adv_reward: # distance-based adversary reward + # adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) + # else: # proximity-based adversary reward (binary) + # adv_rew = 0 + # for a in adversary_agents: + # if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: + # adv_rew -= 5 # Calculate positive reward for agents good_agents = self.good_agents(world) From 4f2b97c6a5767733b4527e87a185831d4c46bc67 Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Mon, 22 Jul 2019 20:00:44 -0400 Subject: [PATCH 45/56] Setup testing.py for the scenario --- multiagent/scenarios/testing.py | 108 +++++++++++++++++--------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 80324dc0a..3cefa0755 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -4,7 +4,7 @@ class Scenario(BaseScenario): - + goalDist = 5.0; def make_world(self): world = World() #World has agents and landmarks # set any world properties first @@ -48,7 +48,7 @@ def reset_world(self, world): agent.state.p_vel = 0 agent.state.c = np.zeros(world.dim_c) for i, landmark in enumerate(world.landmarks): - landmark.state.p_pos = np.array([i,5]) + landmark.state.p_pos = np.array([i,goalDist]) landmark.state.p_vel = 0 def benchmark_data(self, agent, world): @@ -70,69 +70,73 @@ def good_agents(self, world): # def adversaries(self, world): # return [agent for agent in world.agents if agent.adversary] + #Simplified to just distance from y = 5; def reward(self, agent, world): - return np.sum(np.s) - return self.agent_reward(agent,world) + alpha = 0.5 + return alpha * agent.state.p_pos[1] - (1-alpha) * 1/(world.num_agents-1)*sum([other.state.p_pos[1] for other in world.agents if other is not agent]) + #Right now + for distance - average of the distance covered by other agents. 
+ + + # return self.agent_reward(agent,world) # Agents are rewarded based on minimum agent distance to each landmark # return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) - def agent_reward(self, agent, world): #TODO: set reward to distance to their landmark, remove adversary stuff - # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it - shaped_reward = True - shaped_adv_reward = True - - # # Calculate negative reward for adversary - # adversary_agents = self.adversaries(world) - # if shaped_adv_reward: # distance-based adversary reward - # adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) - # else: # proximity-based adversary reward (binary) - # adv_rew = 0 - # for a in adversary_agents: - # if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: - # adv_rew -= 5 - - # Calculate positive reward for agents - good_agents = self.good_agents(world) - if shaped_reward: # distance-based agent reward - pos_rew = -min( - [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) - else: # proximity-based agent reward (binary) - pos_rew = 0 - if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ - < 2 * agent.goal_a.size: - pos_rew += 5 - pos_rew -= min( - [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) - return pos_rew + adv_rew #Rewards are a simple int + # def agent_reward(self, agent, world): #TODO: set reward to distance to their landmark, remove adversary stuff + # # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it + # shaped_reward = True + # shaped_adv_reward = True + + # # # Calculate negative reward for adversary + # # adversary_agents = self.adversaries(world) + # # if shaped_adv_reward: # distance-based adversary reward + # # adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) + # # else: # proximity-based adversary reward (binary) + # # adv_rew = 0 + # # for a in adversary_agents: + # # if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: + # # adv_rew -= 5 + + # # Calculate positive reward for agents + # good_agents = self.good_agents(world) + # if shaped_reward: # distance-based agent reward + # pos_rew = -min( + # [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + # else: # proximity-based agent reward (binary) + # pos_rew = 0 + # if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ + # < 2 * agent.goal_a.size: + # pos_rew += 5 + # pos_rew -= min( + # [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) + # return pos_rew + adv_rew #Rewards are a simple int #Adversaries are given rewards - def adversary_reward(self, agent, world): - # Rewarded based on proximity to the goal landmark - shaped_reward = True - if shaped_reward: # distance-based reward - return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) - else: # proximity-based reward (binary) - adv_rew = 0 - if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: - adv_rew += 5 - return adv_rew + # def adversary_reward(self, agent, world): + # # Rewarded based on proximity to the goal landmark + # 
shaped_reward = True + # if shaped_reward: # distance-based reward + # return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) + # else: # proximity-based reward (binary) + # adv_rew = 0 + # if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: + # adv_rew += 5 + # return adv_rew #What is passed to the agent ie How they see the world def observation(self, agent, world): # get positions of all entities in this agent's reference frame - entity_pos = [] - for entity in world.landmarks: - entity_pos.append(entity.state.p_pos - agent.state.p_pos) - # entity colors - entity_color = [] - for entity in world.landmarks: - entity_color.append(entity.color) - # communication of all other Agents TODO: remove communication + # entity_pos = [] + # for entity in world.landmarks: + # entity_pos.append(entity.state.p_pos - agent.state.p_pos) + entity_pos = [goalDist - agent.state.p_pos[1]] #Should only need the distance to it's own landmark goal + + + # communication of all other Agents other_pos = [] for other in world.agents: - if other is agent: continue - other_pos.append(other.state.p_pos - agent.state.p_pos) + # if other is agent: continue + other_pos.append(goalDist - other.state.p_pos[1]) #Agents know how far other agents are from their goals if not agent.adversary: return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) From 7f59b61a0e0c5d3fa04b798e071df7d51c9edd0f Mon Sep 17 00:00:00 2001 From: dpakalarry Date: Mon, 22 Jul 2019 20:01:22 -0400 Subject: [PATCH 46/56] Added comments --- multiagent/scenarios/testing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py index 3cefa0755..51c2d4f20 100644 --- a/multiagent/scenarios/testing.py +++ b/multiagent/scenarios/testing.py @@ -4,7 +4,8 @@ class Scenario(BaseScenario): - goalDist = 5.0; + goalDist = 5.0; #Currently the distance to landmark + def make_world(self): world = World() #World has agents and landmarks # set any world properties first @@ -72,7 +73,7 @@ def good_agents(self, world): #Simplified to just distance from y = 5; def reward(self, agent, world): - alpha = 0.5 + alpha = 0.5 #Can be adjusted to determine whether individual performance, or ranked importance is more important [0,1] return alpha * agent.state.p_pos[1] - (1-alpha) * 1/(world.num_agents-1)*sum([other.state.p_pos[1] for other in world.agents if other is not agent]) #Right now + for distance - average of the distance covered by other agents. From 1f97b16821f66e98ac1e8fdc3a4946ab6585fbcd Mon Sep 17 00:00:00 2001 From: SimplySonder <46611486+SimplySonder@users.noreply.github.com> Date: Wed, 24 Jul 2019 12:28:47 -0400 Subject: [PATCH 47/56] Added Idea 3 to ScenarioIdeas.md Anthony added first draft of Hunger games Scenario --- ScenarioIdeas.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 82e8e2e58..217c4e80a 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -7,7 +7,7 @@ Generated at: https://www.tablesgenerator.com/markdown_tables | #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. 
| No other entities other than agents | Attack | | | | | Trading gives bonus resources for both agents | | | | | #2 | Move x steps | Reward = progress in last step | No other entities other than agents | Move as far as possible | | -| #3 | | | | | | +| #3 | move, attack, loot, rest | Staying alive, get gear, kills, winning | No other entities other than agents | live and loot | | | #4 | | | | | | | #5 | | | | | | @@ -41,3 +41,57 @@ Add defend action, which blocks attack, but opponent agent gains bigger bonus re #### Notes - The landmarks don't actually get taken into account for the rewards or observation, it's simply aiding visualization of how much progess each agent is able to make. - Agents will either have to be moved by the scenario via physics, or they can move based off of the reward recieved on their next action, the following turn. + +## Idea 3. Hunger Games + +#### World +- 10 x 10 plane where agents try to be the last survivor +- 12 agents start equidistant from each other in a circle + - Middle of circle is high tier loot +- Set loot spawns with a set tier, but random loot +- Structures agents can enter and be hidden from sight + +#### Agent +- Main Attributes + - Attack Range + - Attack Power + - Def + - HP + - Stamina +- Choices: + - Loot + - Sight limited + - Attack an adjacent agent + - Additional attack options possible with certain loot + - Uses Stamina + - Move + - Walk one space + - Run 2x fast w/ Stamina + - Rest + - Recover HP/Stamina +#### Rewards +- Kills are not intrinsically rewarded +- Looting from chests/bodies result in a set reward value per tier/killcount, and additional reward from net stat gain +- Time alive gives slight reward with each tick +- Winning gives the highest reward + +#### Agent Variables +- Environment Knowledge + - Excludes: + Chest loot status + Chest loot items + Alive/Dead Enemy location +- Sight +- Self position +- Attributes +- Loot +- Kill count +- Kill counts of other agents +- List of Alive Agents +- List of Dead Agents +- Attributes of agents in Sight + +#### Notes +- Co-op can be implemented where agents spawn with a teammate they cannot attack, and exchange loot with. +- Combat can function similarly to D&D involving some RNG + From cbfea477097d6ecd4a22671519ec894ca8339e63 Mon Sep 17 00:00:00 2001 From: linlinbest <43051929+linlinbest@users.noreply.github.com> Date: Wed, 24 Jul 2019 12:56:14 -0400 Subject: [PATCH 48/56] Add files via upload The template for Q-learing algorithm in interactive_tmp.py, but there are still some bugs. 
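While those bugs get worked out, a generic tabular Q-learning update looks roughly like the sketch below; it mirrors what `interactive_tmp.py` and the earlier `simpleqlearning.py` appear to be aiming for, but the function name `q_update` and its arguments are assumptions for illustration, not code from this patch. One apparent difference from `simpleqlearning.py` is that the diff below reads the current Q-value without including the action index in the lookup.

```
import numpy as np

# Generic tabular Q-learning update (a sketch; names and defaults are illustrative).
# q_table is indexed by (discretized state..., action); s and s_next are tuples of ints.
def q_update(q_table, s, a, r, s_next, lr=0.1, discount=0.95):
    max_future_q = np.max(q_table[s_next])        # best value reachable from the next state
    current_q = q_table[s + (a,)]                 # note the action index in the lookup
    q_table[s + (a,)] = (1 - lr) * current_q + lr * (r + discount * max_future_q)
    return q_table
```

In the script below, `s` would presumably be the tuple returned by `get_discrete_state(obs_n[i])` and `a` the integer index of the action actually taken by agent `i`.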
--- .../gymMountainCarv0/interactive_tmp.py | 84 ++++++++++++++++++ agents_using_gym/gymMountainCarv0/policy.py | 88 +++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 agents_using_gym/gymMountainCarv0/interactive_tmp.py create mode 100644 agents_using_gym/gymMountainCarv0/policy.py diff --git a/agents_using_gym/gymMountainCarv0/interactive_tmp.py b/agents_using_gym/gymMountainCarv0/interactive_tmp.py new file mode 100644 index 000000000..858378dec --- /dev/null +++ b/agents_using_gym/gymMountainCarv0/interactive_tmp.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +import os,sys +sys.path.insert(1, os.path.join(sys.path[0], '..')) +import argparse + +from multiagent.environment import MultiAgentEnv +from multiagent.policy import InteractivePolicy +import multiagent.scenarios as scenarios + +import numpy as np + +if __name__ == '__main__': + # parse arguments + parser = argparse.ArgumentParser(description=None) + parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.') + args = parser.parse_args() + + # load scenario from script + scenario = scenarios.load(args.scenario).Scenario() + # create world + world = scenario.make_world() + # create multiagent environment + env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer = False) + # render call to create viewer window (necessary only for interactive policies) + env.render() + # create interactive policies for each agent + policies = [InteractivePolicy(env,i) for i in range(env.n)] + + LEARNING_RATE = 0.1 + DISCOUNT = 0.95 + EPISODE = 25000 + + def get_discrete_state(state): + #DISCRETE_OBS_SPACE = [20] * len(state) + high_bound = np.array([1] * len(state)) + low_bound = np.array([-1] * len(state)) + obs_win_size = (high_bound-low_bound) / ([20]* len(state)) + discrete_state = np.subtract(state, low_bound)/ obs_win_size + #print(discrete_state.astype(np.float)) + # we use this tuple to look up the 3 Q values for the available actions in the q-table + return tuple(discrete_state.astype(np.int)) + + # execution loop + obs_n = env.reset() + + #a list of q_tables (one q_table for each agent) + + DISCRETE_OBS_SPACE = [20] * len(obs_n[0]) + q_tables = [] + for i in range(env.n): + q_tables.append(np.random.uniform(low=-3, high=3, size=(DISCRETE_OBS_SPACE + [4]))) + q_tables = np.array(q_tables) + #print(q_tables) + + + #for i in range(EPISODE): do the following + obs_n = env.reset() + while True: + # query for action from each agent's policy + act_n = [] + for i, policy in enumerate(policies): + act_n.append(policy.action(obs_n[i])) + new_discrete_state = get_discrete_state(obs_n[i]) + + print(act_n) + #print(obs_n) + # step environment + obs_n, reward_n, done_n, _ = env.step(act_n) + # render all agent views + env.render() + # display rewards + #for agent in env.world.agents: + # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) + + + if True: + for i, policy in enumerate(policies): + #print(q_tables[tuple([0])+(new_discrete_state,)]) + max_future_q = np.max(q_tables[tuple([i])+new_discrete_state]) + current_q = q_tables[tuple([i])+new_discrete_state] + new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward_n[i] + DISCOUNT * max_future_q) + q_tables[tuple([i])+ new_discrete_state+(act_n[i], )] = new_q + + diff --git a/agents_using_gym/gymMountainCarv0/policy.py b/agents_using_gym/gymMountainCarv0/policy.py new file mode 100644 index 000000000..d8b7ef8ce --- /dev/null +++ 
b/agents_using_gym/gymMountainCarv0/policy.py @@ -0,0 +1,88 @@ +import numpy as np +from pyglet.window import key + +from multiagent.scenarios.simple import Scenario + +# individual agent policy +class Policy(object): + def __init__(self): + self.move = [False for i in range(4)] + def action(self, obs): + #agent = env.agents + raise NotImplementedError() + +# interactive policy based on keyboard input +# hard-coded to deal only with movement, not communication +class InteractivePolicy(Policy): + def __init__(self, env, agent_index): + super(InteractivePolicy, self).__init__() + self.env = env + #self.agent_index = agent_index + # hard-coded keyboard events + self.move = [False for i in range(4)] + self.comm = [False for i in range(env.world.dim_c)] + # register keyboard events with this environment's window + env.viewers[agent_index].window.on_key_press = self.key_press + env.viewers[agent_index].window.on_key_release = self.key_release + + def action(self, obs): + # ignore observation and just act based on keyboard events + + + #x_axis = self.env.agents[self.agent_index].state.p_pos[0] + #y_axis = self.env.agents[self.agent_index].state.p_pos[1] + + ''' + If we try to implement Q-learning in Interactive.action(self, obs), + we may first need to have a get_reward() function for each agent. + + Or a simpler way is to have Interactive.action(self, obs) return the action space + each time. Then implement the Q-learning algorithm in bin/interactive.py since interactive.py have access to everything + and it's more convinient to implement. + ''' + + #obs[2] is the x-axis of the relative position between first landmark and the agent + if obs[2] < 0: + self.move[1] = True + elif obs[2] > 0: + self.move[0] = True + else: + self.move[0] = False + self.move[1] = False + + if obs[3] > 0: + self.move[3] = True + elif obs[3] < 0: + self.move[2] = True + else: + self.move[2] = False + self.move[3] = False + + + if self.env.discrete_action_input: + u = 0 + if self.move[0]: u = 1 + if self.move[1]: u = 2 + if self.move[2]: u = 4 + if self.move[3]: u = 3 + else: + u = np.zeros(5) # 5-d because of no-move action + if self.move[0]: u[1] += 1.0 + if self.move[1]: u[2] += 1.0 + if self.move[3]: u[3] += 1.0 + if self.move[2]: u[4] += 1.0 + if True not in self.move: + u[0] += 1.0 + return np.concatenate([u, np.zeros(self.env.world.dim_c)]) + + # keyboard event callbacks + def key_press(self, k, mod): + if k==key.LEFT: self.move[0] = True + if k==key.RIGHT: self.move[1] = True + if k==key.UP: self.move[2] = True + if k==key.DOWN: self.move[3] = True + def key_release(self, k, mod): + if k==key.LEFT: self.move[0] = False + if k==key.RIGHT: self.move[1] = False + if k==key.UP: self.move[2] = False + if k==key.DOWN: self.move[3] = False From df8bf17c7c0acffb5ba4773e1351ce89b97d2840 Mon Sep 17 00:00:00 2001 From: syhdd <45134514+syhdd@users.noreply.github.com> Date: Wed, 24 Jul 2019 13:16:42 -0400 Subject: [PATCH 49/56] Update ScenarioIdeas.md --- ScenarioIdeas.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 217c4e80a..83aafb2e0 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -95,3 +95,20 @@ Add defend action, which blocks attack, but opponent agent gains bigger bonus re - Co-op can be implemented where agents spawn with a teammate they cannot attack, and exchange loot with. - Combat can function similarly to D&D involving some RNG +## Idea 4. 
Warship Survival Game +The Idea is extended from the hunger game +#### World +100 x 100 plane where agents try to be the last survivor +Every agent was assigned 5 blocks as ships in the plane. +when every blocks are eliminated, the agents are terminated +The agent has sight within 5 blocks which it can attack +#### Main Attributes: +points: every round each agent is assigned 5 points +Attack: use 2 point to attack a block in the 2-D plane(no range limitation) +Move: use 1 point to move one ship into nearby block +generate new ships:use 4 points put an new ship into plane +#### Rewards +Kills are rewarded(granted points or not) +ships are rewarded(one ship is worth 1 point) +#### Notes +Co-op can be implemented in the way that share sight From 447ac7b6afb25421c10f18fe1b253a4bda6d2bac Mon Sep 17 00:00:00 2001 From: syhdd <45134514+syhdd@users.noreply.github.com> Date: Wed, 24 Jul 2019 13:17:17 -0400 Subject: [PATCH 50/56] Update ScenarioIdeas.md --- ScenarioIdeas.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 83aafb2e0..9ff5c1ab4 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -98,17 +98,17 @@ Add defend action, which blocks attack, but opponent agent gains bigger bonus re ## Idea 4. Warship Survival Game The Idea is extended from the hunger game #### World -100 x 100 plane where agents try to be the last survivor -Every agent was assigned 5 blocks as ships in the plane. -when every blocks are eliminated, the agents are terminated -The agent has sight within 5 blocks which it can attack +- 100 x 100 plane where agents try to be the last survivor +- Every agent was assigned 5 blocks as ships in the plane. +- when every blocks are eliminated, the agents are terminated +- The agent has sight within 5 blocks which it can attack #### Main Attributes: -points: every round each agent is assigned 5 points -Attack: use 2 point to attack a block in the 2-D plane(no range limitation) -Move: use 1 point to move one ship into nearby block -generate new ships:use 4 points put an new ship into plane +- points: every round each agent is assigned 5 points +- Attack: use 2 point to attack a block in the 2-D plane(no range limitation) +- Move: use 1 point to move one ship into nearby block +- generate new ships:use 4 points put an new ship into plane #### Rewards -Kills are rewarded(granted points or not) -ships are rewarded(one ship is worth 1 point) +- Kills are rewarded(granted points or not) +- ships are rewarded(one ship is worth 1 point) #### Notes -Co-op can be implemented in the way that share sight +- Co-op can be implemented in the way that share sight From 6ffece211cf0a1092cbb75c98854600792cf6353 Mon Sep 17 00:00:00 2001 From: syhdd <45134514+syhdd@users.noreply.github.com> Date: Wed, 24 Jul 2019 13:20:18 -0400 Subject: [PATCH 51/56] Add new Scenario Idea This Idea is a simplified and extended version from hunger game --- ScenarioIdeas.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md index 9ff5c1ab4..d04cf4759 100644 --- a/ScenarioIdeas.md +++ b/ScenarioIdeas.md @@ -111,4 +111,5 @@ The Idea is extended from the hunger game - Kills are rewarded(granted points or not) - ships are rewarded(one ship is worth 1 point) #### Notes -- Co-op can be implemented in the way that share sight +- Co-op can be implemented in the way that share sight with allay +- Co-op can still attack each other From d01ecf0b6382f92f71999882639027f60fe7bbf6 Mon Sep 17 00:00:00 
2001 From: zrysnd <503591415@qq.com> Date: Wed, 24 Jul 2019 13:33:04 -0400 Subject: [PATCH 52/56] reward based on cheat/cooperate --- multiagent/scenarios/race.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/multiagent/scenarios/race.py b/multiagent/scenarios/race.py index f4a21b57f..2db68db46 100644 --- a/multiagent/scenarios/race.py +++ b/multiagent/scenarios/race.py @@ -47,9 +47,20 @@ def reset_world(self, world): landmark.state.p_vel = np.zeros(world.dim_p) def reward(self, agent, world): - #dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) - dist2 = np.sum(np.square(agent.state.p_pos - np.array([agent.state.p_pos[0], 5.0]))) - return -dist2 + # dist2 = np.sum(np.square(agent.state.p_pos - np.array([agent.state.p_pos[0], 5.0]))) + agentCheated = False + theOtherAgentCheated = False + if agentCheated and theOtherAgentCheated: + return 1 + if agentCheated and !theOtherAgentCheated: + return 5 + if !agentCheated and theOtherAgentCheated: + return -3 + else: + return 3 + # if !agentCheated and !theOtherAgentCheated: + # return 3 + # return -dist2 def observation(self, agent, world): # get positions of all entities in this agent's reference frame From 3e8fc138227c8f21689a4642da9e7dda8456c619 Mon Sep 17 00:00:00 2001 From: Jarbus Date: Wed, 24 Jul 2019 19:28:29 -0400 Subject: [PATCH 53/56] update readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aa9bd3e8a..d07408ccc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -**Status:** Archive (code is provided as-is, no updates expected) +**Status:** Code in active development for a class project at the Rensselaer Polytechnic Institute + +- Most team communication done via mattermost at chat.rcos.io # Multi-Agent Particle Environment From 250d49aa810c9244e3756b34a470577c8f882aa4 Mon Sep 17 00:00:00 2001 From: jarbus Date: Thu, 25 Jul 2019 22:05:58 -0400 Subject: [PATCH 54/56] clean up for pull --- README.md | 4 +- ScenarioIdeas.md | 115 -------------- agents_using_gym/gymMountainCarv0/README.md | 12 -- agents_using_gym/gymMountainCarv0/cheating.py | 13 -- .../gymMountainCarv0/interactive_tmp.py | 84 ---------- agents_using_gym/gymMountainCarv0/policy.py | 88 ----------- .../gymMountainCarv0/simpleqlearning.py | 62 -------- changes.txt | 23 --- documentation.md | 77 ---------- multiagent/scenarios/cus.py | 64 -------- multiagent/scenarios/race.py | 70 --------- multiagent/scenarios/testing.py | 145 ------------------ 12 files changed, 1 insertion(+), 756 deletions(-) delete mode 100644 ScenarioIdeas.md delete mode 100644 agents_using_gym/gymMountainCarv0/README.md delete mode 100644 agents_using_gym/gymMountainCarv0/cheating.py delete mode 100644 agents_using_gym/gymMountainCarv0/interactive_tmp.py delete mode 100644 agents_using_gym/gymMountainCarv0/policy.py delete mode 100644 agents_using_gym/gymMountainCarv0/simpleqlearning.py delete mode 100644 changes.txt delete mode 100644 documentation.md delete mode 100644 multiagent/scenarios/cus.py delete mode 100644 multiagent/scenarios/race.py delete mode 100644 multiagent/scenarios/testing.py diff --git a/README.md b/README.md index d07408ccc..aa9bd3e8a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -**Status:** Code in active development for a class project at the Rensselaer Polytechnic Institute - -- Most team communication done via mattermost at chat.rcos.io +**Status:** Archive (code is provided as-is, no updates expected) # Multi-Agent Particle 
Environment diff --git a/ScenarioIdeas.md b/ScenarioIdeas.md deleted file mode 100644 index d04cf4759..000000000 --- a/ScenarioIdeas.md +++ /dev/null @@ -1,115 +0,0 @@ -# Idea table: - -Generated at: https://www.tablesgenerator.com/markdown_tables - -| | Possible Actions | Rewards per Outcome | Properties of other entities | Nash Equilibrium | Other Notes | -| ---- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- | ------------------------- | ------------- | -| #1 | Expand, attack, trade | Expanding + attacking spends resources for greater resource bonuses later. | No other entities other than agents | Attack | | -| | | Trading gives bonus resources for both agents | | | | -| #2 | Move x steps | Reward = progress in last step | No other entities other than agents | Move as far as possible | | -| #3 | move, attack, loot, rest | Staying alive, get gear, kills, winning | No other entities other than agents | live and loot | | -| #4 | | | | | | -| #5 | | | | | | - -# Details: -## Idea 1. (Risk but on a grid) -Grid based cell game, each agent starts with 1 cell on some part of the grid. Agents use resources to expand, attack, or trade with neighboring cells. Every turn agents gain a set amount of resources based on area of agent's cells. For every neighboring cell, if it is not occupied, the agent can choose to spend resources to expand into the area, or not. If the cell is occupied, the agent can choose to attack, or trade. Attacking allows for the takeover of the cell and requires the agent to spend resources. Trading requires the agent to give resources to the other agent, but if both agents decide to trade, they can recieve some bonus based on who gave more resources. If one agent attacks, and the other trades, the attacker automatically wins. If both attack, the agent that spent more resources to attack wins. Resource costs and bonuses can be tweaked to ensure fairness and balance. -### Examples: -Agent A and Agent B are neighbors: if A trades 2 resources, and B trades 4 resources, A could gain 4(from B) + 2(bonus includes how much given) + 1(some multiplier of how much was given in this case 0.5 for giving less) resulting in net +5, B would gain 2(from A) + 4(given) + 4(multiplier bonus of 1 for giving more) resulting in +6 \ -If A attacks B, spending 5 resources; B attempts to trade 4 resources, A takes over some area of B and gains 4 resources from B's trade with net gain of -1 resource and + some area; B has a net gain of -4 resources and -some area. \ -If A attacks B, spending 5 resources; B attacks A spending 6 resources, B takes some area of A. A has a net gain of -5 resources and -some area; B has a net gain of -5 resources and -some area. - -### Possible expansion: -Add defend action, which blocks attack, but opponent agent gains bigger bonus resource if they try to trade. - -## Idea 2. Race - -#### World -- 2D plane where agents try to race to their landmark. -- Agents only move forward/backward parallel to each other -- All agents start from the same location - -#### Rules -- Agents can take any x number of steps to advance to the landmark. -- If the sum of all the steps taken by the agents(y) exceeds z, then all agents that moved get moved backwards w steps. -- Agents are rewarded based off of how many steps they are successfully able to take per turn. 
- - Say each agent tries to take 10 steps, but because too many agents are trying to move in a turn, they all get moved back 5 steps. In this case, their reward would be -5, even though they tried to take 10 steps. - -#### Variables -- Parameters y, z, and the initial distance for each agent to the landmark can be varied for balance and to compare agent behavior. - -#### Notes -- The landmarks don't actually get taken into account for the rewards or observation, it's simply aiding visualization of how much progess each agent is able to make. -- Agents will either have to be moved by the scenario via physics, or they can move based off of the reward recieved on their next action, the following turn. - -## Idea 3. Hunger Games - -#### World -- 10 x 10 plane where agents try to be the last survivor -- 12 agents start equidistant from each other in a circle - - Middle of circle is high tier loot -- Set loot spawns with a set tier, but random loot -- Structures agents can enter and be hidden from sight - -#### Agent -- Main Attributes - - Attack Range - - Attack Power - - Def - - HP - - Stamina -- Choices: - - Loot - - Sight limited - - Attack an adjacent agent - - Additional attack options possible with certain loot - - Uses Stamina - - Move - - Walk one space - - Run 2x fast w/ Stamina - - Rest - - Recover HP/Stamina -#### Rewards -- Kills are not intrinsically rewarded -- Looting from chests/bodies result in a set reward value per tier/killcount, and additional reward from net stat gain -- Time alive gives slight reward with each tick -- Winning gives the highest reward - -#### Agent Variables -- Environment Knowledge - - Excludes: - Chest loot status - Chest loot items - Alive/Dead Enemy location -- Sight -- Self position -- Attributes -- Loot -- Kill count -- Kill counts of other agents -- List of Alive Agents -- List of Dead Agents -- Attributes of agents in Sight - -#### Notes -- Co-op can be implemented where agents spawn with a teammate they cannot attack, and exchange loot with. -- Combat can function similarly to D&D involving some RNG - -## Idea 4. Warship Survival Game -The Idea is extended from the hunger game -#### World -- 100 x 100 plane where agents try to be the last survivor -- Every agent was assigned 5 blocks as ships in the plane. -- when every blocks are eliminated, the agents are terminated -- The agent has sight within 5 blocks which it can attack -#### Main Attributes: -- points: every round each agent is assigned 5 points -- Attack: use 2 point to attack a block in the 2-D plane(no range limitation) -- Move: use 1 point to move one ship into nearby block -- generate new ships:use 4 points put an new ship into plane -#### Rewards -- Kills are rewarded(granted points or not) -- ships are rewarded(one ship is worth 1 point) -#### Notes -- Co-op can be implemented in the way that share sight with allay -- Co-op can still attack each other diff --git a/agents_using_gym/gymMountainCarv0/README.md b/agents_using_gym/gymMountainCarv0/README.md deleted file mode 100644 index 8a2475783..000000000 --- a/agents_using_gym/gymMountainCarv0/README.md +++ /dev/null @@ -1,12 +0,0 @@ -## This folders incude some agents for gym's mountain car environment. -## The codes in this folder are using Python 3.6.1, gym==0.13.1,numpy==1.16.4. The codes are using some functions from gym==0.13.1 which are not implemented in gym==0.10.5, so please upgrade your gym before running these codes. 
-### If you don't know how to upgrade gym: -``` -pip uninstall gym -pip install gym -``` -## Python files -### These files are just using gym, and can be run by ```python filename.py``` (or ```python3 filename.py``` if you are using linux.) IDEs shold be able to run them as well. -### cheating.py is a straight solution by Mark Yu after 2 seconds of thinking, it represents Mark's superiority against AI. JK. -### simpleqlearning.py is an implementation of qlearning, an algorithm that Mark learnt from wikipedia [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning). Feel free to mess with the learning rate and discountrate in the code and compare the time it takes for the AI to learn how to push the car to the summit. - diff --git a/agents_using_gym/gymMountainCarv0/cheating.py b/agents_using_gym/gymMountainCarv0/cheating.py deleted file mode 100644 index c67572d6c..000000000 --- a/agents_using_gym/gymMountainCarv0/cheating.py +++ /dev/null @@ -1,13 +0,0 @@ -import gym -env = gym.make("MountainCar-v0") - -done=False -state=env.reset() -while not done: - if state[1]<=0: - state, reward, done,info = env.step(0) - else: - state, reward, done,info = env.step(2) - env.render() - -env.close() \ No newline at end of file diff --git a/agents_using_gym/gymMountainCarv0/interactive_tmp.py b/agents_using_gym/gymMountainCarv0/interactive_tmp.py deleted file mode 100644 index 858378dec..000000000 --- a/agents_using_gym/gymMountainCarv0/interactive_tmp.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -import os,sys -sys.path.insert(1, os.path.join(sys.path[0], '..')) -import argparse - -from multiagent.environment import MultiAgentEnv -from multiagent.policy import InteractivePolicy -import multiagent.scenarios as scenarios - -import numpy as np - -if __name__ == '__main__': - # parse arguments - parser = argparse.ArgumentParser(description=None) - parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.') - args = parser.parse_args() - - # load scenario from script - scenario = scenarios.load(args.scenario).Scenario() - # create world - world = scenario.make_world() - # create multiagent environment - env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer = False) - # render call to create viewer window (necessary only for interactive policies) - env.render() - # create interactive policies for each agent - policies = [InteractivePolicy(env,i) for i in range(env.n)] - - LEARNING_RATE = 0.1 - DISCOUNT = 0.95 - EPISODE = 25000 - - def get_discrete_state(state): - #DISCRETE_OBS_SPACE = [20] * len(state) - high_bound = np.array([1] * len(state)) - low_bound = np.array([-1] * len(state)) - obs_win_size = (high_bound-low_bound) / ([20]* len(state)) - discrete_state = np.subtract(state, low_bound)/ obs_win_size - #print(discrete_state.astype(np.float)) - # we use this tuple to look up the 3 Q values for the available actions in the q-table - return tuple(discrete_state.astype(np.int)) - - # execution loop - obs_n = env.reset() - - #a list of q_tables (one q_table for each agent) - - DISCRETE_OBS_SPACE = [20] * len(obs_n[0]) - q_tables = [] - for i in range(env.n): - q_tables.append(np.random.uniform(low=-3, high=3, size=(DISCRETE_OBS_SPACE + [4]))) - q_tables = np.array(q_tables) - #print(q_tables) - - - #for i in range(EPISODE): do the following - obs_n = env.reset() - while True: - # query for action from each agent's policy - act_n = [] - for i, policy in 
enumerate(policies): - act_n.append(policy.action(obs_n[i])) - new_discrete_state = get_discrete_state(obs_n[i]) - - print(act_n) - #print(obs_n) - # step environment - obs_n, reward_n, done_n, _ = env.step(act_n) - # render all agent views - env.render() - # display rewards - #for agent in env.world.agents: - # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) - - - if True: - for i, policy in enumerate(policies): - #print(q_tables[tuple([0])+(new_discrete_state,)]) - max_future_q = np.max(q_tables[tuple([i])+new_discrete_state]) - current_q = q_tables[tuple([i])+new_discrete_state] - new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward_n[i] + DISCOUNT * max_future_q) - q_tables[tuple([i])+ new_discrete_state+(act_n[i], )] = new_q - - diff --git a/agents_using_gym/gymMountainCarv0/policy.py b/agents_using_gym/gymMountainCarv0/policy.py deleted file mode 100644 index d8b7ef8ce..000000000 --- a/agents_using_gym/gymMountainCarv0/policy.py +++ /dev/null @@ -1,88 +0,0 @@ -import numpy as np -from pyglet.window import key - -from multiagent.scenarios.simple import Scenario - -# individual agent policy -class Policy(object): - def __init__(self): - self.move = [False for i in range(4)] - def action(self, obs): - #agent = env.agents - raise NotImplementedError() - -# interactive policy based on keyboard input -# hard-coded to deal only with movement, not communication -class InteractivePolicy(Policy): - def __init__(self, env, agent_index): - super(InteractivePolicy, self).__init__() - self.env = env - #self.agent_index = agent_index - # hard-coded keyboard events - self.move = [False for i in range(4)] - self.comm = [False for i in range(env.world.dim_c)] - # register keyboard events with this environment's window - env.viewers[agent_index].window.on_key_press = self.key_press - env.viewers[agent_index].window.on_key_release = self.key_release - - def action(self, obs): - # ignore observation and just act based on keyboard events - - - #x_axis = self.env.agents[self.agent_index].state.p_pos[0] - #y_axis = self.env.agents[self.agent_index].state.p_pos[1] - - ''' - If we try to implement Q-learning in Interactive.action(self, obs), - we may first need to have a get_reward() function for each agent. - - Or a simpler way is to have Interactive.action(self, obs) return the action space - each time. Then implement the Q-learning algorithm in bin/interactive.py since interactive.py have access to everything - and it's more convinient to implement. 
- ''' - - #obs[2] is the x-axis of the relative position between first landmark and the agent - if obs[2] < 0: - self.move[1] = True - elif obs[2] > 0: - self.move[0] = True - else: - self.move[0] = False - self.move[1] = False - - if obs[3] > 0: - self.move[3] = True - elif obs[3] < 0: - self.move[2] = True - else: - self.move[2] = False - self.move[3] = False - - - if self.env.discrete_action_input: - u = 0 - if self.move[0]: u = 1 - if self.move[1]: u = 2 - if self.move[2]: u = 4 - if self.move[3]: u = 3 - else: - u = np.zeros(5) # 5-d because of no-move action - if self.move[0]: u[1] += 1.0 - if self.move[1]: u[2] += 1.0 - if self.move[3]: u[3] += 1.0 - if self.move[2]: u[4] += 1.0 - if True not in self.move: - u[0] += 1.0 - return np.concatenate([u, np.zeros(self.env.world.dim_c)]) - - # keyboard event callbacks - def key_press(self, k, mod): - if k==key.LEFT: self.move[0] = True - if k==key.RIGHT: self.move[1] = True - if k==key.UP: self.move[2] = True - if k==key.DOWN: self.move[3] = True - def key_release(self, k, mod): - if k==key.LEFT: self.move[0] = False - if k==key.RIGHT: self.move[1] = False - if k==key.UP: self.move[2] = False - if k==key.DOWN: self.move[3] = False diff --git a/agents_using_gym/gymMountainCarv0/simpleqlearning.py b/agents_using_gym/gymMountainCarv0/simpleqlearning.py deleted file mode 100644 index 3e8ccd7d1..000000000 --- a/agents_using_gym/gymMountainCarv0/simpleqlearning.py +++ /dev/null @@ -1,62 +0,0 @@ -import gym -import numpy - -env = gym.make("MountainCar-v0") - -learningrate = 0.7 -discount = 0.90 -#initiallize the Q table [40,40,3] with random values. The meaning of the q table is the q value of a set of [state of positions,state of velocity, action you take]. -#Note that the game is continous but the states of our q table are discrete(since we can only deal with finite states), So I also need a getstate function to turn the continous states into deiscrete states. -#all q values are initialized between -2 and 0 because the reward is always -1 in the mountaincar game. -q_table = numpy.random.uniform(-2, 0, [40,40,3]) - - -def getstate(state): - discrete_state = (state - env.observation_space.low)/((env.observation_space.high-env.observation_space.low)/[40,40]) - return tuple(discrete_state.astype(numpy.int)) # we use this tuple to look up the 3 Q values for the available actions in the q-table - - -for episode in range(2700): - currentstate = getstate(env.reset()) - done = False - #render every 300 episodes to save time. 
- if episode % 300 == 0: - render = True - print(episode) - else: - render = False - - while not done: - action = numpy.argmax(q_table[currentstate]) - new_state, reward, done,info = env.step(action) - #nextstate is the discrete mapping from the new state to the q table - nextstate = getstate(new_state) - - if render: - env.render() - - # Update Q table - if not done: - # Maximum possible Q value in next step (for new state) - maxnextq = numpy.max(q_table[nextstate]) - # Current Q value (for current state and performed action) - current_q = q_table[currentstate + (action,)] - # the qlearning function - new_q = (1 - learningrate) * current_q + learningrate * (reward + discount * maxnextq) - # Update Q table with new Q value - q_table[currentstate + (action,)] = new_q - - - # Simulation ended (for any reson) - if goal position is achived - update Q value with reward directly - elif new_state[0] >= 0.5: - print("We make it!") - print(episode) - q_table[currentstate + (action,)] = 0 - - - currentstate = nextstate - - - - -env.close() \ No newline at end of file diff --git a/changes.txt b/changes.txt deleted file mode 100644 index 95529c034..000000000 --- a/changes.txt +++ /dev/null @@ -1,23 +0,0 @@ -multi_discrete.py: -- [FIXED] Changed random_array assignment in sample() to use gym.utils.seeding - -rendering.py: -- [FIXED] The reraise function appears to no longer exist. Fortunately, all this does is add additional information to a raised - exception, so should be simple to reimplement (or ignore) - -environment.py: -- [FIXED] line 234: `geom.set_color(*entity.color, alpha=0.5)` - receives multiple arguments. This is a pain, because it is - expanding the color argument, a 3-tuple or 4-tuple, but we want to set alpha to 0.5. A dumb fix is to make a new - tuple with the first three arguments of the color, and 0.5 for alpha. - -simple_crypto.py: -- line 121: array in conditional can potentially be a boolean rather than an ndarray so it will have no .all() method - - Similar statements appear on lines 104, 109 -- line 122: the sizes in the expression `agent.state.c - agent.goal_a.color` are mismatched and it is unclear where they - come from - - Similar statements appear on lines 107, 112 -- [Austen] I got it to run by removing the .all() calls and changing lines like - np.sum(np.square(agent.state.c - agent.goal_a.color)) - to - np.square(len(agent.state.c)-len(agent.goal_a.color)) - Not sure if scenario still functions correctly / if reward calculations are accurate diff --git a/documentation.md b/documentation.md deleted file mode 100644 index f7e697ccf..000000000 --- a/documentation.md +++ /dev/null @@ -1,77 +0,0 @@ -# Execution: - -In a simulation with `n` agents: - -1. bin/script.py loads - acts as main script -2. Loads scenario - - `./multiagent/scenario.py.make_world()` -3. Loads multi-agent enviroment given scenario settings and world - - `./multiagent/environment.py.MultiAgentEnv(Scenario.world())` -4. Renders environment (initial render) - - `./multiagent/environment.py.render()` -5. Assigns policies (algorithms) for each agent - - stored as policies[] list - - policy[agent_index] = ./multiagent/policies/template.py.TemplatePolicy(env,agent_index) - - Note: Template not implemented yet, see `./multiagent/policy.py.InteractivePolicy()` for now - - For more information, see [Policies](#POLICIES) -6. Resets environment -7. Infinite while loop - 1. Makes a list of actions, one action per policy - - actions[i] - 2. 
Performs one environment step using entire action list - - `multiagent/environment.py.step()` returns: - - n observations - - n rewards - - n done states - - n debug objects - 3. Re-render - - `multiagent/environment.py.render()` - -## Environment - -The main class in use during execution. The environment interacts with the scenario and the agents. There is one environment that all scenarios use. Each scenario implements reward() and observation() which the environment calls. - -- `./make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. - -- `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) - -## Policy - -A policy seems to be a system to control an agent. The interactive policy allows control of an agent with keyboard and mouse, but if we wish to implement algorithms we will most likely be implementing them as a policy. **NOTE: Policies are enumerable** - -- `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. - -A Policy has two functions: - -- `__init__()` passes the environment to the policy class -- `action(obs)` performs an action given an observation - - -## Scenarios - -A BaseScenario `multiagent/scenario.py` incorporates at least `make_world()` and `reset_world()`. An implemented Scenario will incorporate reward() and observation(). All scenario calls are made through the environment. - -- `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. - -- `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. scenario code consists of several functions: - 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). - called once at the beginning of each training session - 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world - called before every episode (including after make_world() before the first episode) - 3) `reward(agent,world)`: defines the reward function for a given agent - 4) `observation(agent, world)`: defines the observation space of a given agent - 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) - -You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`), and have to keep the same function signature(can't not change parameters), unless we all make changes to multiagent/environment. - -## Miscellaneous - -- `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code.(used for creating a scenario. We might need customized entities, agents for our own scenarios.) - -- `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. - -## Visualization: - -1. Each agent will have one corresponding window generated for itself, agents always locate at the center of the camera in its own wondow. -2. In the interactive policy, pressing -> will make the agent go left in the world, but everything else goes right in its own window(since it's always at the center of its own window). 
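As a quick reference for the Scenarios section above, here is a minimal scenario sketch built only from the attribute and function names that already appear in `multiagent/scenarios/cus.py` and `race.py` below; the concrete positions, colors, and the single-landmark reward are illustrative choices, not an existing file in the repository.

```python
import numpy as np
from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario

class Scenario(BaseScenario):
    def make_world(self):
        # called once at the beginning of a training session
        world = World()
        world.agents = [Agent() for _ in range(1)]
        world.landmarks = [Landmark() for _ in range(1)]
        for i, agent in enumerate(world.agents):
            agent.name = 'agent %d' % i
            agent.collide = False
            agent.silent = True
        for i, landmark in enumerate(world.landmarks):
            landmark.name = 'landmark %d' % i
            landmark.collide = False
            landmark.movable = False
        self.reset_world(world)
        return world

    def reset_world(self, world):
        # called before every episode; fixed (non-random) initial state for simplicity
        for agent in world.agents:
            agent.color = np.array([0.25, 0.25, 0.25])
            agent.state.p_pos = np.zeros(world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        for landmark in world.landmarks:
            landmark.color = np.array([0.75, 0.25, 0.25])
            landmark.state.p_pos = np.array([0.0, 0.5])
            landmark.state.p_vel = np.zeros(world.dim_p)

    def reward(self, agent, world):
        # negative euclidean distance to the single landmark
        delta_pos = agent.state.p_pos - world.landmarks[0].state.p_pos
        return -np.sqrt(np.sum(np.square(delta_pos)))

    def observation(self, agent, world):
        # own velocity plus landmark positions in the agent's reference frame
        entity_pos = [lm.state.p_pos - agent.state.p_pos for lm in world.landmarks]
        return np.concatenate([agent.state.p_vel] + entity_pos)
```

These four methods are exactly the callbacks that `MultiAgentEnv` receives in step 3 of the execution flow, so a new file dropped into `multiagent/scenarios/` needs nothing beyond them (plus an optional `benchmark_data()`).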
- diff --git a/multiagent/scenarios/cus.py b/multiagent/scenarios/cus.py deleted file mode 100644 index c32da49e2..000000000 --- a/multiagent/scenarios/cus.py +++ /dev/null @@ -1,64 +0,0 @@ -import numpy as np -from multiagent.core import World, Agent, Landmark -from multiagent.scenario import BaseScenario - -class Scenario(BaseScenario): - def __init__(self): - super(Scenario, self).__init__() - self.agentsToLandMarks = {} - - def make_world(self): - world = World() - # add agents - numberOfAgents = 2; - world.agents = [Agent() for i in range(numberOfAgents)] - for i, agent in enumerate(world.agents): - agent.name = 'agent %d' % i - agent.collide = False - agent.silent = True - # add landmarks - world.landmarks = [Landmark() for i in range(numberOfAgents)] - for i, landmark in enumerate(world.landmarks): - landmark.name = 'landmark %d' % i - landmark.collide = False - landmark.movable = False - #fill in the dictionary - for i in range(numberOfAgents): - self.agentsToLandMarks.update({ world.agents[i]: world.landmarks[i] }) - - # make initial conditions - self.reset_world(world) - return world - - def reset_world(self, world): - # random properties for agents - for i, agent in enumerate(world.agents): - agent.color = np.array([0.25,0.25,0.25]) - # random properties for landmarks - for i, landmark in enumerate(world.landmarks): - landmark.color = np.array([0.75,0.25,0.25]) - world.landmarks[0].color = np.array([0.75,0.25,0.25]) - # set random initial states - for i,agent in enumerate(world.agents): - # agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) - agent.state.p_pos = np.array([i/2,0]) - agent.state.p_vel = np.zeros(world.dim_p) - agent.state.c = np.zeros(world.dim_c) - for i, landmark in enumerate(world.landmarks): - # landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) - landmark.state.p_pos = np.array([i/2,0.75]) - landmark.state.p_vel = np.zeros(world.dim_p) - - def reward(self, agent, world): - # dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) - # dist2 = world.landmarks[0].state.p_pos - delta_pos = agent.state.p_pos - self.agentsToLandMarks[agent].state.p_pos - dist = np.sqrt(np.sum(np.square(delta_pos))) - return -dist - - def observation(self, agent, world): - # get positions of all entities in this agent's reference frame - entity_pos = [] - for entity in world.landmarks: - entity_pos.append(entity.state.p_pos - agent.state.p_pos) - return np.concatenate([agent.state.p_vel] + entity_pos) diff --git a/multiagent/scenarios/race.py b/multiagent/scenarios/race.py deleted file mode 100644 index 2db68db46..000000000 --- a/multiagent/scenarios/race.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np -from multiagent.core import World, Agent, Landmark -from multiagent.scenario import BaseScenario - -class Scenario(BaseScenario): - def make_world(self): - world = World() - # add agents - world.agents = [Agent() for i in range(2)] - for i, agent in enumerate(world.agents): - agent.name = 'agent %d' % i - agent.collide = False - agent.silent = True - # add landmarks - world.landmarks = [Landmark() for i in range(2)] - for i, landmark in enumerate(world.landmarks): - landmark.name = 'landmark %d' % i - landmark.collide = False - landmark.movable = False - # make initial conditions - self.reset_world(world) - return world - - def reset_world(self, world): - # random properties for agents - for i, agent in enumerate(world.agents): - agent.color = np.array([0.25,0.25,0.25]) - # random properties for landmarks - for i, landmark in 
enumerate(world.landmarks): - landmark.color = np.array([0.75,0.75,0.75]) - world.landmarks[0].color = np.array([0.75,0.25,0.25]) - world.landmarks[1].color = np.array([0.75,0.25,0.25]) - # set random initial states - #for agent in world.agents: - #agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) - world.agents[0].state.p_pos = np.array([0.0,0.0]) - world.agents[0].state.p_vel = np.zeros(world.dim_p) - world.agents[0].state.c = np.zeros(world.dim_c) - - world.agents[1].state.p_pos = np.array([0.5,0.0]) - world.agents[1].state.p_vel = np.zeros(world.dim_p) - world.agents[1].state.c = np.zeros(world.dim_c) - - for i, landmark in enumerate(world.landmarks): - #landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) - landmark.state.p_pos = np.array([0.0 + i*0.5, 5.0]) - landmark.state.p_vel = np.zeros(world.dim_p) - - def reward(self, agent, world): - # dist2 = np.sum(np.square(agent.state.p_pos - np.array([agent.state.p_pos[0], 5.0]))) - agentCheated = False - theOtherAgentCheated = False - if agentCheated and theOtherAgentCheated: - return 1 - if agentCheated and !theOtherAgentCheated: - return 5 - if !agentCheated and theOtherAgentCheated: - return -3 - else: - return 3 - # if !agentCheated and !theOtherAgentCheated: - # return 3 - # return -dist2 - - def observation(self, agent, world): - # get positions of all entities in this agent's reference frame - entity_pos = [] - for entity in world.landmarks: - entity_pos.append(entity.state.p_pos - agent.state.p_pos) - return np.concatenate([agent.state.p_vel] + entity_pos) diff --git a/multiagent/scenarios/testing.py b/multiagent/scenarios/testing.py deleted file mode 100644 index 51c2d4f20..000000000 --- a/multiagent/scenarios/testing.py +++ /dev/null @@ -1,145 +0,0 @@ -import numpy as np -from multiagent.core import World, Agent, Landmark -from multiagent.scenario import BaseScenario - - -class Scenario(BaseScenario): - goalDist = 5.0; #Currently the distance to landmark - - def make_world(self): - world = World() #World has agents and landmarks - # set any world properties first - world.dim_c = 0 - num_agents = 2 #Change this to add agents - world.num_agents = num_agents - num_adversaries = 0 - num_landmarks = num_agents - # add agents - world.agents = [Agent() for i in range(num_agents)] - for i, agent in enumerate(world.agents): - agent.name = 'agent %d' % i - agent.collide = False - agent.silent = True - agent.adversary = True if i < num_adversaries else False - agent.size = 0.15 - # add landmarks - world.landmarks = [Landmark() for i in range(num_landmarks)] - for i, landmark in enumerate(world.landmarks): - landmark.name = 'landmark %d' % i - landmark.collide = False - landmark.movable = False - landmark.size = 0.08 - # make initial conditions - self.reset_world(world) - return world - - def reset_world(self, world): - # random properties for agents - world.agents[0].color = np.array([0.85, 0.35, 0.35]) - for i in range(1, world.num_agents): - world.agents[i].color = np.array([0.35, 0.35, 0.85]) - # random properties for landmarks - for i, landmark in enumerate(world.landmarks): - landmark.color = np.array([0.15, 0.15, 0.15]) - for agent in world.agents: - agent.goal_a = goal - # set random initial states TODO: Initialize agents + landmarks to set positions with 0 velocity - for i, agent in enumerate(world.agents): - agent.state.p_pos = np.array([i/2,0]) - agent.state.p_vel = 0 - agent.state.c = np.zeros(world.dim_c) - for i, landmark in enumerate(world.landmarks): - landmark.state.p_pos = np.array([i,goalDist]) - 
landmark.state.p_vel = 0 - - def benchmark_data(self, agent, world): - # returns data for benchmarking purposes - if agent.adversary: - return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) - else: - dists = [] - for l in world.landmarks: - dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) - dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) - return tuple(dists) - - # return all agents that are not adversaries - def good_agents(self, world): - return [agent for agent in world.agents if not agent.adversary] - - # # return all adversarial agents - # def adversaries(self, world): - # return [agent for agent in world.agents if agent.adversary] - - #Simplified to just distance from y = 5; - def reward(self, agent, world): - alpha = 0.5 #Can be adjusted to determine whether individual performance, or ranked importance is more important [0,1] - return alpha * agent.state.p_pos[1] - (1-alpha) * 1/(world.num_agents-1)*sum([other.state.p_pos[1] for other in world.agents if other is not agent]) - #Right now + for distance - average of the distance covered by other agents. - - - # return self.agent_reward(agent,world) - # Agents are rewarded based on minimum agent distance to each landmark - # return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) - - # def agent_reward(self, agent, world): #TODO: set reward to distance to their landmark, remove adversary stuff - # # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it - # shaped_reward = True - # shaped_adv_reward = True - - # # # Calculate negative reward for adversary - # # adversary_agents = self.adversaries(world) - # # if shaped_adv_reward: # distance-based adversary reward - # # adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) - # # else: # proximity-based adversary reward (binary) - # # adv_rew = 0 - # # for a in adversary_agents: - # # if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: - # # adv_rew -= 5 - - # # Calculate positive reward for agents - # good_agents = self.good_agents(world) - # if shaped_reward: # distance-based agent reward - # pos_rew = -min( - # [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) - # else: # proximity-based agent reward (binary) - # pos_rew = 0 - # if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ - # < 2 * agent.goal_a.size: - # pos_rew += 5 - # pos_rew -= min( - # [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) - # return pos_rew + adv_rew #Rewards are a simple int - - #Adversaries are given rewards - # def adversary_reward(self, agent, world): - # # Rewarded based on proximity to the goal landmark - # shaped_reward = True - # if shaped_reward: # distance-based reward - # return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) - # else: # proximity-based reward (binary) - # adv_rew = 0 - # if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: - # adv_rew += 5 - # return adv_rew - - - #What is passed to the agent ie How they see the world - def observation(self, agent, world): - # get positions of all entities in this agent's reference frame - # entity_pos = [] - # for entity in world.landmarks: - # entity_pos.append(entity.state.p_pos - agent.state.p_pos) - entity_pos = 
[goalDist - agent.state.p_pos[1]] #Should only need the distance to it's own landmark goal - - - # communication of all other Agents - other_pos = [] - for other in world.agents: - # if other is agent: continue - other_pos.append(goalDist - other.state.p_pos[1]) #Agents know how far other agents are from their goals - - if not agent.adversary: - return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) - else: - return np.concatenate(entity_pos + other_pos) From b82e19b2806f3396a6f9c769446ca6ef71f99917 Mon Sep 17 00:00:00 2001 From: jarbus Date: Thu, 25 Jul 2019 22:12:49 -0400 Subject: [PATCH 55/56] push cleanup --- multiagent/policy.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/multiagent/policy.py b/multiagent/policy.py index ff2e8997a..fd94e6ac5 100644 --- a/multiagent/policy.py +++ b/multiagent/policy.py @@ -26,28 +26,6 @@ def __init__(self, env, agent_index): def action(self, obs): # ignore observation and just act based on keyboard events - - - #x_axis = self.env.agents[self.agent_index].state.p_pos[0] - #y_axis = self.env.agents[self.agent_index].state.p_pos[1] - - # if obs[2] < 0: - # self.move[1] = True - # elif obs[2] > 0: - # self.move[0] = True - # else: - # self.move[0] = False - # self.move[1] = False - - # if obs[3] > 0: - # self.move[3] = True - # elif obs[3] < 0: - # self.move[2] = True - # else: - # self.move[2] = False - # self.move[3] = False - - if self.env.discrete_action_input: u = 0 if self.move[0]: u = 1 From 6ec57e79add813d36b345050d9512a87250d763b Mon Sep 17 00:00:00 2001 From: jarbus Date: Thu, 25 Jul 2019 22:17:33 -0400 Subject: [PATCH 56/56] push cleanup --- multiagent/policy.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/multiagent/policy.py b/multiagent/policy.py index fd94e6ac5..9c9cc783b 100644 --- a/multiagent/policy.py +++ b/multiagent/policy.py @@ -34,12 +34,12 @@ def action(self, obs): if self.move[3]: u = 3 else: u = np.zeros(5) # 5-d because of no-move action - if self.move[0]: u[1] += 0.01 - if self.move[1]: u[2] += 0.01 - if self.move[3]: u[3] += 0.01 - if self.move[2]: u[4] += 0.01 + if self.move[0]: u[1] += 1.0 + if self.move[1]: u[2] += 1.0 + if self.move[3]: u[3] += 1.0 + if self.move[2]: u[4] += 1.0 if True not in self.move: - u[0] += 0.01 + u[0] += 1.0 return np.concatenate([u, np.zeros(self.env.world.dim_c)]) # keyboard event callbacks
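For reference, the update behind `simpleqlearning.py` and the Q-table experiment in `interactive_tmp.py` (both removed in the cleanup above) is the standard tabular Q-learning step from the Wikipedia article cited in that folder's README. Below is a minimal, self-contained sketch: the learning rate, discount, and `[40, 40, 3]` table shape are the values from `simpleqlearning.py`, while the `q_update()` helper and the toy transition at the end are made up purely for illustration.

```python
import numpy as np

learningrate = 0.7
discount = 0.90
# Q-table indexed by [position bin, velocity bin, action]; values start in [-2, 0)
# because every per-step reward in MountainCar is -1.
q_table = np.random.uniform(-2, 0, size=(40, 40, 3))

def q_update(q_table, state, action, reward, next_state):
    """One in-place Q-learning step; `state`/`next_state` are discretised index tuples."""
    maxnextq = np.max(q_table[next_state])          # best value reachable from the next state
    current_q = q_table[state + (action,)]          # Q(s, a) before the update
    q_table[state + (action,)] = (1 - learningrate) * current_q \
        + learningrate * (reward + discount * maxnextq)

# Toy usage with made-up discretised states; simpleqlearning.py additionally writes 0
# directly into the entry for the transition that reaches the goal instead of bootstrapping.
state, action, reward, next_state = (3, 5), 2, -1.0, (4, 5)
q_update(q_table, state, action, reward, next_state)
```

The same update appears per-agent in `interactive_tmp.py`, with one table per policy and the reward taken from `env.step(act_n)`.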