
Commit e620282

Merge pull request #5 from RLOpensource/boxing
Boxing
2 parents 72f8aec + 5efe363 · commit e620282

File tree

6 files changed: +188 −21 lines

README.md

Lines changed: 8 additions & 0 deletions
@@ -61,6 +61,7 @@ wait
 <img src="source/pong/openaigym.video.0.20974.video000440.gif" width="33%" height='300'>
 <img src="source/seaquest/openaigym.video.0.27728.video001090.gif" width="33%" height='300'>
 <img src="source/spaceinvader/openaigym.video.0.30111.video004130.gif" width="33%" height='300'>
+<img src="source/boxing/openaigym.video.0.3092.video000930.gif" width="33%" height='300'>
 </div>

 ### Plotting

@@ -93,6 +94,13 @@ wait
 <img src="source/spaceinvader/invader_2.png" width="100%" height="150">
 </div>

+* Boxing
+
+<div align="center">
+<img src="source/boxing/boxing_1.png" width="100%" height="150">
+<img src="source/boxing/boxing_2.png" width="100%" height="150">
+</div>
+

 # Todo

source/boxing/boxing_1.png

173 KB

source/boxing/boxing_2.png

338 KB
1.98 MB

start.sh

Lines changed: 22 additions & 21 deletions
@@ -1,22 +1,23 @@
-python trainer_invader.py --num_actors=20 --task=0 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=learner --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=0 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=learner --reward_clipping=abs_one &
+
+python trainer_boxing.py --num_actors=20 --task=0 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=1 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=2 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=3 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=4 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=5 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=6 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=7 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=8 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=9 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=10 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=11 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=12 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=13 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=14 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=15 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=16 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=17 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=18 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &
+python trainer_boxing.py --num_actors=20 --task=19 --batch_size=32 --queue_size=128 --trajectory=20 --learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 --discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 --gradient_clip_norm=40.0 --job_name=actor --reward_clipping=abs_one &

The 21 removed lines were the corresponding trainer_invader.py commands: the old learner line shown above plus 20 actor lines (--task=0 through --task=19) identical to the new actor lines apart from the script name.
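Since the twenty actor commands differ only in --task, an equivalent launcher can be generated with a short loop. The sketch below is illustrative only (it is not part of this commit) and assumes trainer_boxing.py is run from the repository root with the same flags as above:

# Illustrative launcher (not part of this commit): start one learner and
# 20 actors with the flags used in start.sh, instead of 21 hand-written lines.
import subprocess

COMMON_FLAGS = (
    "--num_actors=20 --batch_size=32 --queue_size=128 --trajectory=20 "
    "--learning_frame=1000000000 --start_learning=0.0006 --end_learning=0.0 "
    "--discount_factor=0.99 --entropy_coef=0.05 --baseline_loss_coef=1.0 "
    "--gradient_clip_norm=40.0 --reward_clipping=abs_one"
).split()

# Learner process (task 0), then one actor process per task id.
procs = [subprocess.Popen(
    ["python", "trainer_boxing.py", "--job_name=learner", "--task=0"] + COMMON_FLAGS)]
for task in range(20):
    procs.append(subprocess.Popen(
        ["python", "trainer_boxing.py", "--job_name=actor",
         "--task={}".format(task)] + COMMON_FLAGS))

# Wait for all processes to finish.
for p in procs:
    p.wait()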

trainer_boxing.py

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
import tensorflow as tf
import numpy as np

import tensorboardX
import buffer_queue
import collections
import py_process
import wrappers
import config
import model
import time
import gym

flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS


flags.DEFINE_integer('num_actors', 4, 'Number of actors.')
flags.DEFINE_integer('task', -1, 'Task id. Use -1 for local training.')
flags.DEFINE_integer('batch_size', 32, 'how many batch learner should be training')
flags.DEFINE_integer('queue_size', 128, 'fifoqueue size')
flags.DEFINE_integer('trajectory', 20, 'trajectory length')
flags.DEFINE_integer('learning_frame', int(1e9), 'trajectory length')

flags.DEFINE_float('start_learning_rate', 0.0006, 'start_learning_rate')
flags.DEFINE_float('end_learning_rate', 0, 'end_learning_rate')
flags.DEFINE_float('discount_factor', 0.99, 'discount factor')
flags.DEFINE_float('entropy_coef', 0.05, 'entropy coefficient')
flags.DEFINE_float('baseline_loss_coef', 0.5, 'baseline coefficient')
flags.DEFINE_float('gradient_clip_norm', 40.0, 'gradient clip norm')

flags.DEFINE_enum('job_name', 'learner', ['learner', 'actor'], 'Job name. Ignored when task is set to -1')
flags.DEFINE_enum('reward_clipping', 'abs_one', ['abs_one', 'soft_asymmetric'], 'Reward clipping.')

def main(_):

    local_job_device = '/job:{}/task:{}'.format(FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:{}'.format(8001+i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']})

    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task)

    filters = [shared_job_device, local_job_device]

    input_shape = [84, 84, 4]
    output_size = 18
    env_name = 'BoxingDeterministic-v4'

    with tf.device(shared_job_device):
        queue = buffer_queue.FIFOQueue(
            FLAGS.trajectory, input_shape, output_size,
            FLAGS.queue_size, FLAGS.batch_size, FLAGS.num_actors)
        learner = model.IMPALA(
            trajectory=FLAGS.trajectory,
            input_shape=input_shape,
            num_action=output_size,
            discount_factor=FLAGS.discount_factor,
            start_learning_rate=FLAGS.start_learning_rate,
            end_learning_rate=FLAGS.end_learning_rate,
            learning_frame=FLAGS.learning_frame,
            baseline_loss_coef=FLAGS.baseline_loss_coef,
            entropy_coef=FLAGS.entropy_coef,
            gradient_clip_norm=FLAGS.gradient_clip_norm)

    sess = tf.Session(server.target)
    queue.set_session(sess)
    learner.set_session(sess)

    if is_learner:

        writer = tensorboardX.SummaryWriter('runs/learner')
        train_step = 0

        while True:
            size = queue.get_size()
            if size > 3 * FLAGS.batch_size:
                train_step += 1
                batch = queue.sample_batch()
                s = time.time()
                pi_loss, baseline_loss, entropy, learning_rate = learner.train(
                    state=np.stack(batch.state),
                    reward=np.stack(batch.reward),
                    action=np.stack(batch.action),
                    done=np.stack(batch.done),
                    behavior_policy=np.stack(batch.behavior_policy))
                writer.add_scalar('data/pi_loss', pi_loss, train_step)
                writer.add_scalar('data/baseline_loss', baseline_loss, train_step)
                writer.add_scalar('data/entropy', entropy, train_step)
                writer.add_scalar('data/learning_rate', learning_rate, train_step)
                writer.add_scalar('data/time', time.time() - s, train_step)
    else:

        trajectory_data = collections.namedtuple(
            'trajectory_data',
            ['state', 'next_state', 'reward', 'done', 'action', 'behavior_policy'])

        env = wrappers.make_uint8_env(env_name)
        if FLAGS.task == 0:
            env = gym.wrappers.Monitor(env, 'save-mov', video_callable=lambda episode_id: episode_id%10==0)
        state = env.reset()

        episode = 0
        score = 0
        episode_step = 0
        total_max_prob = 0

        writer = tensorboardX.SummaryWriter('runs/actor_{}'.format(FLAGS.task))

        while True:

            unroll_data = trajectory_data([], [], [], [], [], [])

            for _ in range(FLAGS.trajectory):

                action, behavior_policy, max_prob = learner.get_policy_and_action(state)

                episode_step += 1
                total_max_prob += max_prob

                next_state, reward, done, info = env.step(action)

                score += reward

                unroll_data.state.append(state)
                unroll_data.next_state.append(next_state)
                unroll_data.reward.append(reward)
                unroll_data.done.append(done)
                unroll_data.action.append(action)
                unroll_data.behavior_policy.append(behavior_policy)

                state = next_state

                if done:

                    print(episode, score)
                    writer.add_scalar('data/prob', total_max_prob / episode_step, episode)
                    writer.add_scalar('data/score', score, episode)
                    writer.add_scalar('data/episode_step', episode_step, episode)
                    episode += 1
                    score = 0
                    episode_step = 0
                    total_max_prob = 0
                    state = env.reset()

            queue.append_to_queue(
                task=FLAGS.task, unrolled_state=unroll_data.state,
                unrolled_next_state=unroll_data.next_state, unrolled_reward=unroll_data.reward,
                unrolled_done=unroll_data.done, unrolled_action=unroll_data.action,
                unrolled_behavior_policy=unroll_data.behavior_policy)

if __name__ == '__main__':
    tf.app.run()
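One detail worth noting in trainer_boxing.py: output_size is hard-coded to 18, which matches the discrete action set that gym reports for BoxingDeterministic-v4, while input_shape = [84, 84, 4] presumably reflects the 84×84, 4-frame-stacked observations produced by wrappers.make_uint8_env. A quick check, not part of the commit and assuming a gym installation with the Atari environments available:

# Sanity check (illustrative, not part of this commit): confirm the
# environment used by trainer_boxing.py exposes 18 discrete actions,
# matching the hard-coded output_size.
import gym

env = gym.make('BoxingDeterministic-v4')
print(env.action_space.n)           # expected: 18
print(env.observation_space.shape)  # raw frames: (210, 160, 3) before wrapping
env.close()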
