Commit f504908

andrewcoh and Ervin T. authored
Develop magic string + trajectory (#3122)
* added team id and identifier concat to behavior parameters
* splitting brain params into brain name and identifiers
* set team id in prefab
* receives brain_name and identifier on python side
* rebased with develop
* Correctly calls concatBehaviorIdentifiers
* trainer_controller expects name_behavior_ids
* add_policy and create_policy separated
* adjusting tests to expect trainer.add_policy to be called
* fixing tests
* fixed naming name_behavior_id
* passes all pytest and C# tests
* fixed printing nonsense
* fixed assets that got messed up
* added ppo/sac_policy attributes to keep up with master
* fixing ci tests
* fixing more ci tests
* fixed default trainer_util test to expect brain_name
* fixing ci ppo_policy
* fixed more ci problems/removed self.policies
* Add agent group name to Trajectory
* Rename to behavior_id
* magic string protocol with trainer refactor
* fixed logger warning
* removed self.policy from rl_trainer
* removed self.trainer from trainer.py
* fixed increment_step tests
* fixing circleci tests
* type annotations to the trainer params
* parameters descriptions for trainers

Co-authored-by: Ervin T. <ervin@unity3d.com>
1 parent fb7afbb commit f504908

File tree

17 files changed: +324 -135 lines changed


UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ public override void OnInspectorGUI()
             EditorGUILayout.PropertyField(so.FindProperty("m_InferenceDevice"), true);
             EditorGUI.indentLevel--;
             EditorGUILayout.PropertyField(so.FindProperty("m_BehaviorType"));
+            EditorGUILayout.PropertyField(so.FindProperty("m_TeamID"));
             EditorGUILayout.PropertyField(so.FindProperty("m_useChildSensors"), true);
             // EditorGUILayout.PropertyField(serializedObject.FindProperty("m_Heuristic"), true);
             EditorGUI.indentLevel--;

UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,6 @@
 using Barracuda;
 using System;
+using System.Collections.Generic;
 using UnityEngine;
 
 namespace MLAgents
@@ -34,6 +35,8 @@ private enum BehaviorType
         [HideInInspector]
         [SerializeField]
         string m_BehaviorName = "My Behavior";
+        [HideInInspector] [SerializeField]
+        int m_TeamID = 0;
         [HideInInspector]
         [SerializeField]
         [Tooltip("Use all Sensor components attached to child GameObjects of this Agent.")]
@@ -51,7 +54,9 @@ public bool useChildSensors
 
         public string behaviorName
         {
-            get { return m_BehaviorName; }
+
+            get { return m_BehaviorName + "?team=" + m_TeamID;}
+
         }
 
         public IPolicy GeneratePolicy(Func<float[]> heuristic)
@@ -65,7 +70,7 @@ public IPolicy GeneratePolicy(Func<float[]> heuristic)
                 case BehaviorType.Default:
                     if (FindObjectOfType<Academy>().IsCommunicatorOn)
                     {
-                        return new RemotePolicy(m_BrainParameters, m_BehaviorName);
+                        return new RemotePolicy(m_BrainParameters, behaviorName);
                     }
                     if (m_Model != null)
                     {
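The behaviorName getter above now appends "?team=" and the team id, so the identifier sent to Python looks like "My Behavior?team=0". The Python-side parsing helper is not part of this diff; the following is only a minimal sketch of how that convention could be split back into a brain name and team id, with an illustrative function name.

    # Hypothetical helper -- the real Python-side parsing code is not shown in
    # this commit, so the function name here is illustrative only.
    from typing import Tuple

    def parse_behavior_id(name_behavior_id: str) -> Tuple[str, int]:
        """Split 'BrainName?team=N' back into ('BrainName', N); team defaults to 0."""
        if "?team=" in name_behavior_id:
            brain_name, team = name_behavior_id.rsplit("?team=", 1)
            return brain_name, int(team)
        return name_behavior_id, 0

    # parse_behavior_id("My Behavior?team=0") -> ("My Behavior", 0)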

ml-agents/mlagents/trainers/agent_processor.py

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,7 @@ def __init__(
         self,
         trainer: Trainer,
         policy: TFPolicy,
+        behavior_id: str,
         stats_reporter: StatsReporter,
         max_trajectory_length: int = sys.maxsize,
     ):
@@ -44,6 +45,7 @@ def __init__(
         self.stats_reporter = stats_reporter
         self.trainer = trainer
         self.max_trajectory_length = max_trajectory_length
+        self.behavior_id = behavior_id
 
     def add_experiences(
         self,
@@ -133,6 +135,7 @@ def add_experiences(
                     steps=self.experience_buffers[agent_id],
                     agent_id=agent_id,
                     next_obs=next_obs,
+                    behavior_id=self.behavior_id,
                 )
                 # This will eventually be replaced with a queue
                 self.trainer.process_trajectory(trajectory)
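With the extra constructor argument, whoever builds the processor supplies the full behavior identifier, and every Trajectory it emits is tagged with it. A minimal sketch of the call site, assuming the surrounding setup objects (trainer, policy, stats_reporter) already exist; the identifier value is illustrative:

    # Sketch only: the real construction happens elsewhere in trainer setup.
    processor = AgentProcessor(
        trainer=trainer,
        policy=policy,
        behavior_id="My Behavior?team=0",  # the Unity-side magic string
        stats_reporter=stats_reporter,
    )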

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 71 additions & 24 deletions
@@ -10,6 +10,8 @@
 from mlagents.trainers.ppo.policy import PPOPolicy
 from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
 from mlagents.trainers.rl_trainer import RLTrainer
+from mlagents.trainers.brain import BrainParameters
+from mlagents.trainers.tf_policy import TFPolicy
 from mlagents.trainers.trajectory import Trajectory
 
 logger = logging.getLogger("mlagents.trainers")
@@ -20,26 +22,28 @@ class PPOTrainer(RLTrainer):
 
     def __init__(
         self,
-        brain,
-        reward_buff_cap,
-        trainer_parameters,
-        training,
-        load,
-        seed,
-        run_id,
-        multi_gpu,
+        brain_name: str,
+        reward_buff_cap: int,
+        trainer_parameters: dict,
+        training: bool,
+        load: bool,
+        seed: int,
+        run_id: str,
+        multi_gpu: bool,
     ):
         """
         Responsible for collecting experiences and training PPO model.
-        :param trainer_parameters: The parameters for the trainer (dictionary).
+        :param brain_name: The name of the brain associated with trainer config
         :param reward_buff_cap: Max reward history to track in the reward buffer
+        :param trainer_parameters: The parameters for the trainer (dictionary).
         :param training: Whether the trainer is set for training.
         :param load: Whether the model should be loaded.
         :param seed: The seed the model will be initialized with
         :param run_id: The identifier of the current run
+        :param multi_gpu: Boolean for multi-gpu policy model
         """
         super(PPOTrainer, self).__init__(
-            brain, trainer_parameters, training, run_id, reward_buff_cap
+            brain_name, trainer_parameters, training, run_id, reward_buff_cap
         )
         self.param_keys = [
             "batch_size",
@@ -63,19 +67,10 @@ def __init__(
             "reward_signals",
         ]
         self.check_param_keys()
-
-        if multi_gpu and len(get_devices()) > 1:
-            self.ppo_policy = MultiGpuPPOPolicy(
-                seed, brain, trainer_parameters, self.is_training, load
-            )
-        else:
-            self.ppo_policy = PPOPolicy(
-                seed, brain, trainer_parameters, self.is_training, load
-            )
-        self.policy = self.ppo_policy
-
-        for _reward_signal in self.policy.reward_signals.keys():
-            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+        self.load = load
+        self.multi_gpu = multi_gpu
+        self.seed = seed
+        self.policy: TFPolicy = None
 
     def process_trajectory(self, trajectory: Trajectory) -> None:
         """
@@ -161,7 +156,9 @@ def process_trajectory(self, trajectory: Trajectory) -> None:
 
         # If this was a terminal trajectory, append stats and reset reward collection
         if trajectory.done_reached:
-            self._update_end_episode_stats(agent_id)
+            self._update_end_episode_stats(
+                agent_id, self.get_policy(trajectory.behavior_id)
+            )
 
     def is_ready_update(self):
         """
@@ -218,6 +215,56 @@ def update_policy(self):
             self.stats_reporter.add_stat(stat, val)
         self.clear_update_buffer()
 
+    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
+        """
+        Creates a PPO policy to trainers list of policies.
+        :param brain_parameters: specifications for policy construction
+        :return policy
+        """
+
+        if self.multi_gpu and len(get_devices()) > 1:
+            policy: PPOPolicy = MultiGpuPPOPolicy(
+                self.seed,
+                brain_parameters,
+                self.trainer_parameters,
+                self.is_training,
+                self.load,
+            )
+        else:
+            policy = PPOPolicy(
+                self.seed,
+                brain_parameters,
+                self.trainer_parameters,
+                self.is_training,
+                self.load,
+            )
+
+        for _reward_signal in policy.reward_signals.keys():
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+
+        return policy
+
+    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
+        """
+        Adds policy to trainer.
+        :param brain_parameters: specifications for policy construction
+        """
+        if self.policy:
+            logger.warning(
+                "add_policy has been called twice. {} is not a multi-agent trainer".format(
+                    self.__class__.__name__
+                )
+            )
+        self.policy = policy
+
+    def get_policy(self, name_behavior_id: str) -> TFPolicy:
+        """
+        Gets policy from trainer associated with name_behavior_id
+        :param name_behavior_id: full identifier of policy
+        """
+
+        return self.policy
+
 
 def discount_rewards(r, gamma=0.99, value_next=0.0):
     """

ml-agents/mlagents/trainers/rl_trainer.py

Lines changed: 3 additions & 2 deletions
@@ -3,6 +3,7 @@
 from typing import Dict
 from collections import defaultdict
 
+from mlagents.trainers.tf_policy import TFPolicy
 from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.trainer import Trainer, UnityTrainerException
 from mlagents.trainers.components.reward_signals import RewardSignalResult
@@ -47,7 +48,7 @@ def end_episode(self) -> None:
         for agent_id in rewards:
             rewards[agent_id] = 0
 
-    def _update_end_episode_stats(self, agent_id: str) -> None:
+    def _update_end_episode_stats(self, agent_id: str, policy: TFPolicy) -> None:
         self.episode_steps[agent_id] = 0
         for name, rewards in self.collected_rewards.items():
             if name == "environment":
@@ -58,7 +59,7 @@ def _update_end_episode_stats(self, agent_id: str) -> None:
                 rewards[agent_id] = 0
             else:
                 self.stats_reporter.add_stat(
-                    self.policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
+                    policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
                 )
                 rewards[agent_id] = 0
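Passing the policy in explicitly means a trainer that eventually owns several policies can report reward-signal stats for whichever policy produced the trajectory, instead of relying on a single self.policy attribute. A sketch of the calling pattern, mirroring the ppo/trainer.py change above:

    # Inside a trainer's process_trajectory (sketch based on the hunks above):
    if trajectory.done_reached:
        policy = self.get_policy(trajectory.behavior_id)
        self._update_end_episode_stats(agent_id, policy)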
