
Commit 5d1350f

Create 3layersdqn.py
1 parent c7aa4f2 commit 5d1350f

File tree (1 file changed)

agents_using_gym/dqn/3layersdqn.py

Lines changed: 283 additions & 0 deletions
#!/usr/bin/env python
import os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import argparse

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import numpy as np
import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import time
import random
from tqdm import tqdm
from PIL import Image


if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()

    # initial observations for all agents
    obs_n = env.reset()

    DISCOUNT = 0.99
    REPLAY_MEMORY_SIZE = 200  # How many last steps to keep for model training
    MIN_REPLAY_MEMORY_SIZE = 100  # Minimum number of steps in a memory to start training
    MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
    UPDATE_TARGET_EVERY = 10  # Terminal states (end of episodes)
    MODEL_NAME = '32'
    MIN_REWARD = 20  # For model save
    MEMORY_FRACTION = 0.20

    # Environment settings
    EPISODES = 2000

    # Exploration settings
    epsilon = 1  # not a constant, going to be decayed
    EPSILON_DECAY = 0.99975
    MIN_EPSILON = 0.001

    # Stats settings
    AGGREGATE_STATS_EVERY = 50  # episodes
    SHOW_PREVIEW = False

    # For stats: one reward history per agent
    # (a list comprehension is used instead of [[-200]] * len(obs_n),
    #  which would make every entry reference the same inner list)
    ep_rewards = [[-200] for _ in range(len(obs_n))]

    # For more repetitive results
    random.seed(1)
    np.random.seed(1)
    tf.set_random_seed(1)

    # Memory fraction, used mostly when training multiple agents
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
    # backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

    # Create models folder
    if not os.path.isdir('models'):
        os.makedirs('models')

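    # Note on the exploration schedule: epsilon is decayed once per episode at the
    # bottom of the training loop, and 0.99975 ** 2000 is roughly 0.61, so epsilon
    # never gets near MIN_EPSILON within EPISODES; a faster decay or more episodes
    # would be needed for mostly-greedy behaviour by the end of training.
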
    # Own TensorBoard class
    class ModifiedTensorBoard(TensorBoard):

        # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.step = 1
            self.writer = tf.summary.FileWriter(self.log_dir)

        # Overriding this method to stop creating default log writer
        def set_model(self, model):
            pass

        # Overridden to save logs with our step number
        # (otherwise every .fit() will start writing from 0th step)
        def on_epoch_end(self, epoch, logs=None):
            self.update_stats(**logs)

        # Overridden: we train for one batch only, no need to save anything at epoch end
        def on_batch_end(self, batch, logs=None):
            pass

        # Overridden so it won't close the writer
        def on_train_end(self, _):
            pass

        # Custom method for saving own metrics:
        # writes them with the shared writer at the current step
        def update_stats(self, **stats):
            self._write_logs(stats, self.step)


    # Agent class
    class DQNAgent:
        def __init__(self, i):
            self.index = i

            # Main model
            self.model = self.create_model()

            # Target network
            self.target_model = self.create_model()
            self.target_model.set_weights(self.model.get_weights())

            # An array with last n steps for training
            self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

            # Custom tensorboard object
            self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}-{}".format(MODEL_NAME, self.index, int(time.time())))

            # Used to count when to update target network with main network's weights
            self.target_update_counter = 0

        def create_model(self):
            model = Sequential()
            # input_dim is given explicitly so the model is built here and the
            # weight copy into the target network in __init__ has weights to copy
            model.add(Dense(len(obs_n[0]), input_dim=len(obs_n[0])))
            # model.add(Conv2D(256, (3, 3), input_shape=(10, 10, 3)))  # OBSERVATION_SPACE_VALUES = (10, 10, 3) a 10x10 RGB image.
            model.add(Activation('linear'))
            # model.add(MaxPooling2D(pool_size=(2, 2)))

            # model.add(Dropout(0.2))

            # model.add(Conv2D(256, (3, 3)))
            # model.add(Activation('relu'))
            # model.add(MaxPooling2D(pool_size=(2, 2)))
            # model.add(Dropout(0.2))

            # model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
            model.add(Dense(32, activation='linear'))

            model.add(Dense(5, activation='linear'))  # output layer: one Q value per action (ACTION_SPACE_SIZE = 5)
            model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
            return model

        # Adds step's data to a memory replay array
        # (observation space, action, reward, new observation space, done)
        def update_replay_memory(self, transition):
            self.replay_memory.append(transition)

        # Trains main network every step during episode
        def train(self, terminal_state, step):

            # Start training only if certain number of samples is already saved
            if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
                return

            # Get a minibatch of random samples from memory replay table
            minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

            # Get current states from minibatch, then query NN model for Q values
            # (observations are rescaled with (x + 1) / 2 before going into the network)
            current_states = (np.array([transition[0] for transition in minibatch]) + 1) / 2
            current_qs_list = self.model.predict(current_states)

            # Get future states from minibatch, then query the target network for Q values
            # (when using a target network, query it; otherwise the main network would be queried)
            new_current_states = (np.array([transition[3] for transition in minibatch]) + 1) / 2
            future_qs_list = self.target_model.predict(new_current_states)

            X = []
            y = []

            # Now we need to enumerate our batch
            for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

                # If not a terminal state, get new q from future states, otherwise
                # the target is just the reward (no bootstrapped term)
                if not done:
                    max_future_q = np.max(future_qs_list[index])
                    new_q = reward + DISCOUNT * max_future_q
                else:
                    new_q = reward

                # Update Q value for given state
                current_qs = current_qs_list[index]
                current_qs[action] = new_q

                # And append to our training data
                X.append(current_state)
                y.append(current_qs)

            # Fit on all samples as one batch, log only on terminal state
            self.model.fit((np.array(X) + 1) / 2, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

            # Update target network counter every episode
            if terminal_state:
                self.target_update_counter += 1

            # If counter reaches set value, update target network with weights of main network
            if self.target_update_counter > UPDATE_TARGET_EVERY:
                self.target_model.set_weights(self.model.get_weights())
                self.target_update_counter = 0

        # Queries main network for Q values given current observation space (environment state)
        def get_qs(self, state):
            return self.model.predict((np.array(state).reshape(-1, *state.shape) + 1) / 2)[0]

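    # For reference: DQNAgent.train() implements the standard one-step DQN target,
    # computed with the target network Q_hat = self.target_model:
    #
    #     target(s, a) = r                                     if done
    #     target(s, a) = r + DISCOUNT * max_a' Q_hat(s', a')   otherwise
    #
    # and the online model is then fit towards these targets with an MSE loss
    # over the sampled minibatch.
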
    # create one DQN policy per agent in the environment
    policies = [DQNAgent(i) for i in range(env.n)]

    for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
        # one reward accumulator per agent
        episode_reward = [0 for _ in range(env.n)]
        step = 1
        for i, policy in enumerate(policies):
            policy.tensorboard.step = episode

        obs_n = env.reset()
        done = False
        while not done:

            # query an action from each agent's policy (epsilon-greedy)
            act_n = []
            action_n = []
            for i, policy in enumerate(policies):
                act = np.zeros(5)
                if np.random.random() > epsilon:
                    # Greedy action from the Q network
                    action = np.argmax(policy.get_qs(obs_n[i]))
                else:
                    # Random action
                    action = np.random.randint(0, 5)
                act[action] += 1.0  # one-hot action vector expected by the environment
                action_n.append(action)
                act_n.append(act)

            # step environment
            newobs_n, reward_n, done_n, _ = env.step(act_n)
            # cap episode length at 100 steps
            if step >= 100:
                done = True

            for i, policy in enumerate(policies):
                episode_reward[i] += reward_n[i]
                policy.update_replay_memory((obs_n[i], action_n[i], reward_n[i], newobs_n[i], done))
                policy.train(done, step)

            obs_n = newobs_n
            step += 1

        # if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
        if episode % 100 == 1:
            env.render()

        for i, policy in enumerate(policies):
            ep_rewards[i].append(episode_reward[i])
            if not episode % AGGREGATE_STATS_EVERY or episode == 1:
                average_reward = sum(ep_rewards[i][-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[i][-AGGREGATE_STATS_EVERY:])
                min_reward = min(ep_rewards[i][-AGGREGATE_STATS_EVERY:])
                max_reward = max(ep_rewards[i][-AGGREGATE_STATS_EVERY:])
                policy.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

                # Save model, but only when min reward is greater or equal a set value
                if min_reward >= MIN_REWARD:
                    policy.model.save(f'models/{MODEL_NAME + str(policy.index)}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

        # decay epsilon once per episode
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
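
Usage note: the script is run with a scenario file name from multiagent/scenarios (the default is simple.py), writes TensorBoard logs under logs/, and saves per-agent models under models/ once the minimum reward threshold is reached. The sketch below is an assumption, not part of the committed file: it shows how a saved model could be reloaded and run greedily for one 100-step episode, reusing the env object built by the script; the file names are hypothetical placeholders (real ones embed reward statistics and a timestamp).

from keras.models import load_model
import numpy as np

# hypothetical paths; substitute the actual files written to models/
eval_models = [load_model('models/agent{}.model'.format(i)) for i in range(env.n)]

obs_n = env.reset()
for _ in range(100):
    act_n = []
    for i in range(env.n):
        # same (x + 1) / 2 rescaling as in training, then pick the greedy action
        qs = eval_models[i].predict((np.array(obs_n[i]).reshape(1, -1) + 1) / 2)[0]
        act = np.zeros(5)
        act[np.argmax(qs)] = 1.0  # one-hot action vector, as in the training loop
        act_n.append(act)
    obs_n, reward_n, done_n, _ = env.step(act_n)
    env.render()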