Commit ca210c4

first commit
0 parents  commit ca210c4


43 files changed: +3888 -0 lines changed

3.png (166 KB)

4.png (89.9 KB)

MADDPG.py

+163
@@ -0,0 +1,163 @@
from model import Critic, Actor
import torch as th
from copy import deepcopy
from memory import ReplayMemory, Experience
from torch.optim import Adam
from randomProcess import OrnsteinUhlenbeckProcess
from torch.autograd import Variable
import torch.nn as nn
import numpy as np
from params import scale_reward


def soft_update(target, source, t):
    # Polyak averaging: move each target parameter a fraction t toward the source
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(
            (1 - t) * target_param.data + t * source_param.data)


def hard_update(target, source):
    # copy the source parameters into the target verbatim
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(source_param.data)


class MADDPG:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size,
                 capacity, episodes_before_train):
        self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)]
        self.critics = [Critic(n_agents, dim_obs,
                               dim_act) for i in range(n_agents)]
        self.actors_target = deepcopy(self.actors)
        self.critics_target = deepcopy(self.critics)

        self.n_agents = n_agents
        self.n_states = dim_obs
        self.n_actions = dim_act
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.use_cuda = th.cuda.is_available()
        self.episodes_before_train = episodes_before_train

        self.GAMMA = 0.95
        self.tau = 0.01

        self.var = [1.0 for i in range(n_agents)]
        self.critic_optimizer = [Adam(x.parameters(),
                                      lr=0.001) for x in self.critics]
        self.actor_optimizer = [Adam(x.parameters(),
                                     lr=0.0001) for x in self.actors]

        if self.use_cuda:
            for x in self.actors:
                x.cuda()
            for x in self.critics:
                x.cuda()
            for x in self.actors_target:
                x.cuda()
            for x in self.critics_target:
                x.cuda()

        self.steps_done = 0
        self.episode_done = 0

    def update_policy(self):
        # do not train until exploration is enough
        if self.episode_done <= self.episodes_before_train:
            return None, None

        ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

        c_loss = []
        a_loss = []
        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))
            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = Variable(th.stack(batch.states).type(FloatTensor))
            action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
            reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
            # non_final_next_states: (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = Variable(th.stack(
                [s for s in batch.next_states
                 if s is not None]).type(FloatTensor))

            # critic update for the current agent: the centralized critic
            # conditions on the joint observation and joint action
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            # next-state actions come from every agent's target actor
            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:, i, :])
                for i in range(self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0, 1).contiguous())

            target_Q = Variable(th.zeros(
                self.batch_size).type(FloatTensor))
            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.n_states),
                non_final_next_actions.view(-1,
                                            self.n_agents * self.n_actions))

            # scale_reward: to scale reward in Q functions
            target_Q = (target_Q * self.GAMMA) + (
                reward_batch[:, agent] * scale_reward)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            # actor update: maximize the centralized critic's value with
            # respect to this agent's own action
            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)

        # periodically soft-update the target networks
        if self.steps_done % 100 == 0 and self.steps_done > 0:
            for i in range(self.n_agents):
                soft_update(self.critics_target[i], self.critics[i], self.tau)
                soft_update(self.actors_target[i], self.actors[i], self.tau)

        return c_loss, a_loss

    def select_action(self, state_batch):
        # state_batch: n_agents x state_dim
        actions = Variable(th.zeros(
            self.n_agents,
            self.n_actions))
        FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
        for i in range(self.n_agents):
            sb = state_batch[i, :].detach()
            act = self.actors[i](sb.unsqueeze(0)).squeeze()

            # exploration noise; note the dimension 2 is hard-coded to dim_act
            act += Variable(
                th.from_numpy(
                    np.random.randn(2) * self.var[i]).type(FloatTensor))

            # decay the noise scale once training has started
            if self.episode_done > self.episodes_before_train and\
                    self.var[i] > 0.05:
                self.var[i] *= 0.999998
            act = th.clamp(act, -1.0, 1.0)

            actions[i, :] = act
        self.steps_done += 1

        return actions
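For illustration only, a minimal sketch of what soft_update does: a Polyak averaging step that moves each target parameter a fraction t of the way toward the corresponding source parameter (MADDPG applies it to the actor and critic target networks with tau = 0.01). The toy nn.Linear modules below are hypothetical and exist only to show the effect.

from MADDPG import soft_update  # assumes the repo's local modules (model, memory, params, ...) are importable
import torch as th
import torch.nn as nn

# hypothetical toy modules, purely to illustrate the update rule
target = nn.Linear(4, 2)
source = nn.Linear(4, 2)

before = target.weight.data.clone()
soft_update(target, source, t=0.01)

# each target weight is now 0.99 * old_target + 0.01 * source
diff = target.weight.data - (0.99 * before + 0.01 * source.weight.data)
print(diff.abs().max())  # ~0 up to floating-point error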

README.org

+46
@@ -0,0 +1,46 @@
#+TITLE: An implementation of MADDPG
#+AUTHOR: xuehy

#+STARTUP: content

* 1. Introduction

This is a pytorch implementation of the multi-agent deep deterministic policy gradient (MADDPG) algorithm.

The experimental environment is a modified version of Waterworld based on [[https://github.com/sisl/MADRL][MADRL]].

* 2. Environment

The main features of the modified Waterworld environment (compared with MADRL) are:

- evaders and poisons now bounce off the walls according to physical rules
- the evaders, pursuers and poisons now all have the same size

* 3. Dependency

- [[https://github.com/pytorch/pytorch][pytorch]]
- [[https://github.com/facebookresearch/visdom][visdom]]
- `python==3.6.1' (using anaconda/miniconda is recommended)
- `opencv' is required if you need to render the environments

* 4. Install

- Install [[https://github.com/sisl/MADRL][MADRL]].
- Replace the `madrl_environments/pursuit' directory with the one in this repo.
- Run `python main.py'.

If scene rendering is enabled, it is recommended to install `opencv' through [[https://github.com/conda-forge/opencv-feedstock][conda-forge]].

* 5. Results

** two agents, cooperation = 2

#+ATTR_HTML: style="center"
[[3.png]]

the average reward:

[[4.png]]
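For reference, a minimal sketch of the setup behind the plots above, mirroring the settings used in main.py in this commit (two pursuers, n_coop = 2). The MAWaterWorld_mod keyword arguments belong to the modified environment shipped with this repo, not to upstream MADRL.

from madrl_environments.pursuit import MAWaterWorld_mod
from MADDPG import MADDPG

# settings copied from main.py (two cooperating pursuers)
world = MAWaterWorld_mod(n_pursuers=2, n_evaders=50, n_poison=50,
                         obstacle_radius=0.04, food_reward=10.,
                         poison_reward=-1., encounter_reward=0.01,
                         n_coop=2, sensor_range=0.2, obstacle_loc=None)
maddpg = MADDPG(n_agents=world.n_pursuers, dim_obs=213, dim_act=2,
                batch_size=1000, capacity=1000000,
                episodes_before_train=100)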

__pycache__/MADDPG.cpython-36.pyc (5.33 KB, binary file not shown)

__pycache__/memory.cpython-36.pyc (1.08 KB, binary file not shown)

__pycache__/model.cpython-36.pyc (1.57 KB, binary file not shown)

__pycache__/params.cpython-36.pyc (141 bytes, binary file not shown)

2.11 KB (binary file not shown)

main.py

+130
@@ -0,0 +1,130 @@
from torch.autograd import Variable
from madrl_environments.pursuit import MAWaterWorld_mod
from MADDPG import MADDPG
import numpy as np
import torch as th
import visdom
from params import scale_reward

# do not render the scene
e_render = False

food_reward = 10.
poison_reward = -1.
encounter_reward = 0.01
n_coop = 2
world = MAWaterWorld_mod(n_pursuers=2, n_evaders=50,
                         n_poison=50, obstacle_radius=0.04,
                         food_reward=food_reward,
                         poison_reward=poison_reward,
                         encounter_reward=encounter_reward,
                         n_coop=n_coop,
                         sensor_range=0.2, obstacle_loc=None, )

vis = visdom.Visdom(port=5274)
reward_record = []

np.random.seed(1234)
th.manual_seed(1234)
world.seed(1234)
n_agents = world.n_pursuers
n_states = 213
n_actions = 2
capacity = 1000000
batch_size = 1000

n_episode = 20000
max_steps = 1000
episodes_before_train = 100

win = None
param = None

maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
                episodes_before_train)

FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor
for i_episode in range(n_episode):
    obs = world.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
    total_reward = 0.0
    rr = np.zeros((n_agents,))
    for t in range(max_steps):
        # render every 100 episodes to speed up training
        if i_episode % 100 == 0 and e_render:
            world.render()
        obs = Variable(obs).type(FloatTensor)
        action = maddpg.select_action(obs).data.cpu()
        obs_, reward, done, _ = world.step(action.numpy())

        reward = th.FloatTensor(reward).type(FloatTensor)
        obs_ = np.stack(obs_)
        obs_ = th.from_numpy(obs_).float()
        if t != max_steps - 1:
            next_obs = obs_
        else:
            next_obs = None

        total_reward += reward.sum()
        rr += reward.cpu().numpy()
        # store the joint transition for all agents
        maddpg.memory.push(obs.data, action, next_obs, reward)
        obs = next_obs

        c_loss, a_loss = maddpg.update_policy()
    maddpg.episode_done += 1
    print('Episode: %d, reward = %f' % (i_episode, total_reward))
    reward_record.append(total_reward)

    if maddpg.episode_done == maddpg.episodes_before_train:
        print('training now begins...')
        print('MADDPG on WaterWorld\n' +
              'scale_reward=%f\n' % scale_reward +
              'agent=%d' % n_agents +
              ', coop=%d' % n_coop +
              ' \nlr=0.001, 0.0001, sensor_range=0.2\n' +
              'food=%f, poison=%f, encounter=%f' % (
                  food_reward,
                  poison_reward,
                  encounter_reward))

    # plot the total and per-agent episode rewards with visdom
    if win is None:
        win = vis.line(X=np.arange(i_episode, i_episode+1),
                       Y=np.array([
                           np.append(total_reward, rr)]),
                       opts=dict(
                           ylabel='Reward',
                           xlabel='Episode',
                           title='MADDPG on WaterWorld_mod\n' +
                                 'agent=%d' % n_agents +
                                 ', coop=%d' % n_coop +
                                 ', sensor_range=0.2\n' +
                                 'food=%f, poison=%f, encounter=%f' % (
                                     food_reward,
                                     poison_reward,
                                     encounter_reward),
                           legend=['Total'] +
                                  ['Agent-%d' % i for i in range(n_agents)]))
    else:
        vis.line(X=np.array(
                     [np.array(i_episode).repeat(n_agents+1)]),
                 Y=np.array([np.append(total_reward, rr)]),
                 win=win,
                 update='append')
    # plot the exploration noise variance of agent 0
    if param is None:
        param = vis.line(X=np.arange(i_episode, i_episode+1),
                         Y=np.array([maddpg.var[0]]),
                         opts=dict(
                             ylabel='Var',
                             xlabel='Episode',
                             title='MADDPG on WaterWorld: Exploration',
                             legend=['Variance']))
    else:
        vis.line(X=np.array([i_episode]),
                 Y=np.array([maddpg.var[0]]),
                 win=param,
                 update='append')

world.close()

memory.py

+23
@@ -0,0 +1,23 @@
from collections import namedtuple
import random

Experience = namedtuple('Experience',
                        ('states', 'actions', 'next_states', 'rewards'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Experience(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
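For illustration, a minimal sketch of how this buffer is driven by MADDPG: whole multi-agent transitions are pushed, and update_policy later regroups a sampled list of Experience tuples field-wise. The tensor shapes below follow the n_agents x dim_obs and n_agents x dim_act convention from main.py and are only placeholders.

import torch as th
from memory import ReplayMemory, Experience

mem = ReplayMemory(capacity=1000000)

# one transition: per-agent observations, actions, next observations, rewards
obs = th.zeros(2, 213)   # n_agents x dim_obs (placeholder values)
act = th.zeros(2, 2)     # n_agents x dim_act
reward = th.zeros(2)
mem.push(obs, act, obs, reward)

# regroup a sampled batch field-wise, as MADDPG.update_policy does
batch = Experience(*zip(*mem.sample(1)))
states = th.stack(batch.states)   # 1 x n_agents x dim_obs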
