xuehy
diff --git a/‎3.png
166 KB b/‎3.png
166 KB
diff --git a/‎4.png
89.9 KB b/‎4.png
89.9 KB
diff --git a/‎MADDPG.py
+163 b/‎MADDPG.py
+163
diff --git a/‎README.org
+46 b/‎README.org
+46
diff --git a/‎__pycache__/MADDPG.cpython-36.pyc
5.33 KB b/‎__pycache__/MADDPG.cpython-36.pyc
5.33 KB
diff --git a/‎__pycache__/memory.cpython-36.pyc
1.08 KB b/‎__pycache__/memory.cpython-36.pyc
1.08 KB
diff --git a/‎__pycache__/model.cpython-36.pyc
1.57 KB b/‎__pycache__/model.cpython-36.pyc
1.57 KB
diff --git a/‎__pycache__/params.cpython-36.pyc
141 Bytes b/‎__pycache__/params.cpython-36.pyc
141 Bytes
diff --git a/‎__pycache__/randomProcess.cpython-36.pyc
2.11 KB b/‎__pycache__/randomProcess.cpython-36.pyc
2.11 KB
diff --git a/‎main.py
+130 b/‎main.py
+130
diff --git a/‎memory.py
+23 b/‎memory.py
+23
@@ -0,0 +1,163 @@
+from model import Critic, Actor
+import torch as th
+from copy import deepcopy
+from memory import ReplayMemory, Experience
+from torch.optim import Adam
+from randomProcess import OrnsteinUhlenbeckProcess
+from torch.autograd import Variable
+import torch.nn as nn
+import numpy as np
+from params import scale_reward
+
+
+def soft_update(target, source, t):
+    for target_param, source_param in zip(target.parameters(),
+                                          source.parameters()):
+        target_param.data.copy_(
+            (1 - t) * target_param.data + t * source_param.data)
+
+
+def hard_update(target, source):
+    for target_param, source_param in zip(target.parameters(),
+                                          source.parameters()):
+        target_param.data.copy_(source_param.data)
+
+
+class MADDPG:
+    def __init__(self, n_agents, dim_obs, dim_act, batch_size,
+                 capacity, episodes_before_train):
+        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
+        self.critics = [Critic(n_agents, dim_obs,
+                               dim_act) for i in range(n_agents)]
+        self.actors_target = deepcopy(self.actors)
+        self.critics_target = deepcopy(self.critics)
+
+        self.n_agents = n_agents
+        self.n_states = dim_obs
+        self.n_actions = dim_act
+        self.memory = ReplayMemory(capacity)
+        self.batch_size = batch_size
+        self.use_cuda = th.cuda.is_available()
+        self.episodes_before_train = episodes_before_train
+
+        self.GAMMA = 0.95
+        self.tau = 0.01
+
+        self.var = [1.0 for i in range(n_agents)]
+        self.critic_optimizer = [Adam(x.parameters(),
+                                      lr=0.001) for x in self.critics]
+        self.actor_optimizer = [Adam(x.parameters(),
+                                     lr=0.0001) for x in self.actors]
+
+        if self.use_cuda:
+            for x in self.actors:
+                x.cuda()
+            for x in self.critics:
+                x.cuda()
+            for x in self.actors_target:
+                x.cuda()
+            for x in self.critics_target:
+                x.cuda()
+
+        self.steps_done = 0
+        self.episode_done = 0
+
+    def update_policy(self):
+        # do not train until exploration is enough
+        if self.episode_done <= self.episodes_before_train:
+            return None, None
+
+        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
+        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
+
+        c_loss = []
+        a_loss = []
+        for agent in range(self.n_agents):
+            transitions = self.memory.sample(self.batch_size)
+            batch = Experience(*zip(*transitions))
+            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
+                                                 batch.next_states)))
+            # state_batch: batch_size x n_agents x dim_obs
+            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
+            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
+            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
+            # : (batch_size_non_final) x n_agents x dim_obs
+            non_final_next_states = Variable(th.stack(
+                [s for s in batch.next_states
+                 if s is not None]).type(FloatTensor))
+
+            # for current agent
+            whole_state = state_batch.view(self.batch_size, -1)
+            whole_action = action_batch.view(self.batch_size, -1)
+            self.critic_optimizer[agent].zero_grad()
+            current_Q = self.critics[agent](whole_state, whole_action)
+
+            non_final_next_actions = [
+                self.actors_target[i](non_final_next_states[:,
+                                                            i,
+                                                            :]) for i in range(
+                                                                self.n_agents)]
+            non_final_next_actions = th.stack(non_final_next_actions)
+#            non_final_next_actions = Variable(non_final_next_actions)
+            non_final_next_actions = (
+                non_final_next_actions.transpose(0,
+                                                 1).contiguous())
+
+            target_Q = Variable(th.zeros(
+                self.batch_size).type(FloatTensor))
+            target_Q[non_final_mask] = self.critics_target[agent](
+                non_final_next_states.view(-1, self.n_agents * self.n_states),
+                non_final_next_actions.view(-1,
+                                            self.n_agents * self.n_actions))
+
+            # scale_reward: to scale reward in Q functions
+            target_Q = (target_Q * self.GAMMA) + (
+                reward_batch[:, agent] * scale_reward)
+
+            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
+            loss_Q.backward()
+            self.critic_optimizer[agent].step()
+
+            self.actor_optimizer[agent].zero_grad()
+            state_i = state_batch[:, agent, :]
+            action_i = self.actors[agent](state_i)
+            ac = action_batch.clone()
+            ac[:, agent, :] = action_i
+            whole_action = ac.view(self.batch_size, -1)
+            actor_loss = -self.critics[agent](whole_state, whole_action)
+            actor_loss = actor_loss.mean()
+            actor_loss.backward()
+            self.actor_optimizer[agent].step()
+            c_loss.append(loss_Q)
+            a_loss.append(actor_loss)
+
+        if self.steps_done % 100 == 0 and self.steps_done > 0:
+            for i in range(self.n_agents):
+                soft_update(self.critics_target[i], self.critics[i], self.tau)
+                soft_update(self.actors_target[i], self.actors[i], self.tau)
+
+        return c_loss, a_loss
+
+    def select_action(self, state_batch):
+        # state_batch: n_agents x state_dim
+        actions = Variable(th.zeros(
+            self.n_agents,
+            self.n_actions))
+        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
+        for i in range(self.n_agents):
+            sb = state_batch[i, :].detach()
+            act = self.actors[i](sb.unsqueeze(0)).squeeze()
+
+            act += Variable(
+                th.from_numpy(
+                    np.random.randn(2) * self.var[i]).type(FloatTensor))
+
+            if self.episode_done > self.episodes_before_train and\
+               self.var[i] > 0.05:
+                self.var[i] *= 0.999998
+            act = th.clamp(act, -1.0, 1.0)
+
+            actions[i, :] = act
+        self.steps_done += 1
+
+        return actions
@@ -0,0 +1,46 @@
+#+TITLE: An implementation of MADDPG
+#+AUTHOR: xuehy
+#+EMAIL: [email protected]
+#+STARTUP: content
+
+* 1. Introduction
+
+This is a PyTorch implementation of the multi-agent deep deterministic policy gradient (MADDPG) algorithm.
+
+The experimental environment is a modified version of Waterworld based on [[https://github.com/sisl/MADRL][MADRL]]. 
+
+* 2. Environment
+
+The main features (different from MADRL) of the modified Waterworld environment are:
+
+- evaders and poisons now bounce at the wall obeying physical rules
+- sizes of the evaders, pursuers and poisons are now the same
+
+* 3. Dependency
+
+- [[https://github.com/pytorch/pytorch][pytorch]]
+- [[https://github.com/facebookresearch/visdom][visdom]]
+- `python==3.6.1' (using Anaconda/Miniconda is recommended)
+- if you need to render the environments, `opencv' is required
+
+* 4. Install
+
+- Install [[https://github.com/sisl/MADRL][MADRL]].
+- Replace the `madrl_environments/pursuit' directory with the one in this repo.
+- `python main.py' 
+
+If scene rendering is enabled, it is recommended to install `opencv' through [[https://github.com/conda-forge/opencv-feedstock][conda-forge]].
+
+* 5. Results
+
+** two agents, cooperation = 2
+#+ATTR_HTML style="center"
+[[3.png]]
+
+the average
+
+[[4.png]]
+
+
+
+
@@ -0,0 +1,130 @@
+from torch.autograd import Variable
+from madrl_environments.pursuit import MAWaterWorld_mod
+from MADDPG import MADDPG
+import numpy as np
+import torch as th
+import visdom
+from params import scale_reward
+
+# do not render the scene
+e_render = False
+
+food_reward = 10.
+poison_reward = -1.
+encounter_reward = 0.01
+n_coop = 2
+world = MAWaterWorld_mod(n_pursuers=2, n_evaders=50,
+                         n_poison=50, obstacle_radius=0.04,
+                         food_reward=food_reward,
+                         poison_reward=poison_reward,
+                         encounter_reward=encounter_reward,
+                         n_coop=n_coop,
+                         sensor_range=0.2, obstacle_loc=None, )
+
+vis = visdom.Visdom(port=5274)
+reward_record = []
+
+np.random.seed(1234)
+th.manual_seed(1234)
+world.seed(1234)
+n_agents = world.n_pursuers
+n_states = 213
+n_actions = 2
+capacity = 1000000
+batch_size = 1000
+
+n_episode = 20000
+max_steps = 1000
+episodes_before_train = 100
+
+win = None
+param = None
+
+maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
+                episodes_before_train)
+
+FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor
+for i_episode in range(n_episode):
+    obs = world.reset()
+    obs = np.stack(obs)
+    if isinstance(obs, np.ndarray):
+        obs = th.from_numpy(obs).float()
+    total_reward = 0.0
+    rr = np.zeros((n_agents,))
+    for t in range(max_steps):
+        # render every 100 episodes to speed up training
+        if i_episode % 100 == 0 and e_render:
+            world.render()
+        obs = Variable(obs).type(FloatTensor)
+        action = maddpg.select_action(obs).data.cpu()
+        obs_, reward, done, _ = world.step(action.numpy())
+
+        reward = th.FloatTensor(reward).type(FloatTensor)
+        obs_ = np.stack(obs_)
+        obs_ = th.from_numpy(obs_).float()
+        if t != max_steps - 1:
+            next_obs = obs_
+        else:
+            next_obs = None
+
+        total_reward += reward.sum()
+        rr += reward.cpu().numpy()
+        maddpg.memory.push(obs.data, action, next_obs, reward)
+        obs = next_obs
+
+        c_loss, a_loss = maddpg.update_policy()
+    maddpg.episode_done += 1
+    print('Episode: %d, reward = %f' % (i_episode, total_reward))
+    reward_record.append(total_reward)
+
+    if maddpg.episode_done == maddpg.episodes_before_train:
+        print('training now begins...')
+        print('MADDPG on WaterWorld\n' +
+              'scale_reward=%f\n' % scale_reward +
+              'agent=%d' % n_agents +
+              ', coop=%d' % n_coop +
+              ' \nlr=0.001, 0.0001, sensor_range=0.3\n' +
+              'food=%f, poison=%f, encounter=%f' % (
+                  food_reward,
+                  poison_reward,
+                  encounter_reward))
+
+    if win is None:
+        win = vis.line(X=np.arange(i_episode, i_episode+1),
+                       Y=np.array([
+                           np.append(total_reward, rr)]),
+                       opts=dict(
+                           ylabel='Reward',
+                           xlabel='Episode',
+                           title='MADDPG on WaterWorld_mod\n' +
+                           'agent=%d' % n_agents +
+                           ', coop=%d' % n_coop +
+                           ', sensor_range=0.2\n' +
+                           'food=%f, poison=%f, encounter=%f' % (
+                               food_reward,
+                               poison_reward,
+                               encounter_reward),
+                           legend=['Total'] +
+                           ['Agent-%d' % i for i in range(n_agents)]))
+    else:
+        vis.line(X=np.array(
+            [np.array(i_episode).repeat(n_agents+1)]),
+                 Y=np.array([np.append(total_reward,
+                                       rr)]),
+                 win=win,
+                 update='append')
+    if param is None:
+        param = vis.line(X=np.arange(i_episode, i_episode+1),
+                         Y=np.array([maddpg.var[0]]),
+                         opts=dict(
+                             ylabel='Var',
+                             xlabel='Episode',
+                             title='MADDPG on WaterWorld: Exploration',
+                             legend=['Variance']))
+    else:
+        vis.line(X=np.array([i_episode]),
+                 Y=np.array([maddpg.var[0]]),
+                 win=param,
+                 update='append')
+
+world.close()
@@ -0,0 +1,23 @@
+from collections import namedtuple
+import random
+# One stored transition; ReplayMemory.push builds it from its positional
+# arguments in this field order.
+Experience = namedtuple('Experience',
+                        ('states', 'actions', 'next_states', 'rewards'))
+
+
+class ReplayMemory:
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.memory = []
+        self.position = 0
+
+    def push(self, *args):
+        if len(self.memory) < self.capacity:
+            self.memory.append(None)
+        self.memory[self.position] = Experience(*args)
+        self.position = (self.position + 1) % self.capacity
+
+    def sample(self, batch_size):
+        return random.sample(self.memory, batch_size)
+
+    def __len__(self):
+        return len(self.memory)