Commit 277d66f

Authored by Ervin T
[coma2] Make group extrinsic reward part of extrinsic (#5033)
* Make group extrinsic part of extrinsic
* Fix test and init
* Fix tests and bug
* Add baseline loss to TensorBoard
1 parent 7ec4b34 commit 277d66f

File tree

10 files changed: +122 -50 lines changed

config/ppo/PushBlockCollab.yaml

Lines changed: 2 additions & 2 deletions
@@ -16,11 +16,11 @@ behaviors:
       num_layers: 2
       vis_encode_type: simple
     reward_signals:
-      group:
+      extrinsic:
         gamma: 0.99
         strength: 1.0
     keep_checkpoints: 5
-    max_steps: 20000000 #2000000
+    max_steps: 20000000
     time_horizon: 64
     summary_freq: 60000
     threaded: true
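
The `group` reward signal in this config is folded into the standard `extrinsic` signal. A small sketch (assuming the post-commit settings module is importable) of how that YAML block now resolves to the new ExtrinsicSettings class rather than the generic RewardSignalSettings:

from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType

signal_type = RewardSignalType("extrinsic")        # key from the YAML block above
settings_cls = signal_type.to_settings()           # mapping updated in settings.py below
assert settings_cls is ExtrinsicSettings

settings = settings_cls(gamma=0.99, strength=1.0)  # values from the YAML block
assert settings.add_groupmate_rewards is False     # default; the COMA2 trainer flips it on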

ml-agents/mlagents/trainers/coma/optimizer_torch.py

Lines changed: 30 additions & 1 deletion
@@ -14,7 +14,13 @@
 from mlagents_envs.base_env import ObservationSpec, ActionSpec
 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
-from mlagents.trainers.settings import TrainerSettings, PPOSettings
+from mlagents.trainers.settings import (
+    ExtrinsicSettings,
+    RewardSignalSettings,
+    RewardSignalType,
+    TrainerSettings,
+    PPOSettings,
+)
 from mlagents.trainers.torch.networks import Critic, MultiInputNetworkBody
 from mlagents.trainers.torch.decoders import ValueHeads
 from mlagents.trainers.torch.agent_action import AgentAction
@@ -23,6 +29,10 @@
 from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil
 from mlagents.trainers.settings import NetworkSettings

+from mlagents_envs.logging_util import get_logger
+
+logger = get_logger(__name__)
+

 class TorchCOMAOptimizer(TorchOptimizer):
     class COMAValueNetwork(torch.nn.Module, Critic):
@@ -157,6 +167,24 @@ def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
         self.value_memory_dict: Dict[str, torch.Tensor] = {}
         self.baseline_memory_dict: Dict[str, torch.Tensor] = {}

+    def create_reward_signals(
+        self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
+    ) -> None:
+        """
+        Create reward signals. Override default to provide warnings for Curiosity and
+        GAIL, and make sure Extrinsic adds team rewards.
+        :param reward_signal_configs: Reward signal config.
+        """
+        for reward_signal, settings in reward_signal_configs.items():
+            if reward_signal != RewardSignalType.EXTRINSIC:
+                logger.warning(
+                    f"Reward Signal {reward_signal.value} is not supported with the COMA2 trainer; \
+                    results may be unexpected."
+                )
+            elif isinstance(settings, ExtrinsicSettings):
+                settings.add_groupmate_rewards = True
+        super().create_reward_signals(reward_signal_configs)
+
     @property
     def critic(self):
         return self._critic
@@ -335,6 +363,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
             # TODO: After PyTorch is default, change to something more correct.
             "Losses/Policy Loss": torch.abs(policy_loss).item(),
             "Losses/Value Loss": value_loss.item(),
+            "Losses/Baseline Loss": baseline_loss.item(),
             "Policy/Learning Rate": decay_lr,
             "Policy/Epsilon": decay_eps,
             "Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py

Lines changed: 8 additions & 2 deletions
@@ -10,7 +10,11 @@

 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.settings import TrainerSettings
+from mlagents.trainers.settings import (
+    TrainerSettings,
+    RewardSignalSettings,
+    RewardSignalType,
+)
 from mlagents.trainers.torch.utils import ModelUtils


@@ -44,7 +48,9 @@ def critic(self):
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         pass

-    def create_reward_signals(self, reward_signal_configs):
+    def create_reward_signals(
+        self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
+    ) -> None:
         """
         Create reward signals
         :param reward_signal_configs: Reward signal config.

ml-agents/mlagents/trainers/settings.py

Lines changed: 8 additions & 4 deletions
@@ -165,15 +165,13 @@ def _reward_signal_steps_per_update_default(self):
 # INTRINSIC REWARD SIGNALS #############################################################
 class RewardSignalType(Enum):
     EXTRINSIC: str = "extrinsic"
-    GROUP_EXTRINSIC: str = "group"
     GAIL: str = "gail"
     CURIOSITY: str = "curiosity"
     RND: str = "rnd"

     def to_settings(self) -> type:
         _mapping = {
-            RewardSignalType.EXTRINSIC: RewardSignalSettings,
-            RewardSignalType.GROUP_EXTRINSIC: RewardSignalSettings,
+            RewardSignalType.EXTRINSIC: ExtrinsicSettings,
             RewardSignalType.GAIL: GAILSettings,
             RewardSignalType.CURIOSITY: CuriositySettings,
             RewardSignalType.RND: RNDSettings,
@@ -217,6 +215,12 @@ def structure(d: Mapping, t: type) -> Any:
         return d_final


+@attr.s(auto_attribs=True)
+class ExtrinsicSettings(RewardSignalSettings):
+    # For use with COMA2. Add groupmate rewards to the final extrinsic reward.
+    add_groupmate_rewards: bool = False
+
+
 @attr.s(auto_attribs=True)
 class GAILSettings(RewardSignalSettings):
     learning_rate: float = 3e-4
@@ -625,7 +629,7 @@ def _set_default_hyperparameters(self):

     network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
     reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
-        factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
+        factory=lambda: {RewardSignalType.EXTRINSIC: ExtrinsicSettings()}
     )
     init_path: Optional[str] = None
     keep_checkpoints: int = 5
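
With ExtrinsicSettings in place, a COMA2-style configuration mirrors the updated test setup. A minimal hedged sketch, assuming the post-commit settings module:

from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType

reward_signals = {
    RewardSignalType.EXTRINSIC: ExtrinsicSettings(
        gamma=0.99, strength=1.0, add_groupmate_rewards=True
    )
}
assert reward_signals[RewardSignalType.EXTRINSIC].add_groupmate_rewards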

ml-agents/mlagents/trainers/tests/torch/test_coma.py

Lines changed: 19 additions & 5 deletions
@@ -4,7 +4,7 @@
 import attr

 from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
-from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
+from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType

 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.tests import mock_brain as mb
@@ -49,7 +49,9 @@ def create_test_coma_optimizer(dummy_config, use_rnn, use_discrete, use_visual):

     trainer_settings = attr.evolve(dummy_config)
     trainer_settings.reward_signals = {
-        RewardSignalType.GROUP_EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
+        RewardSignalType.EXTRINSIC: ExtrinsicSettings(
+            strength=1.0, gamma=0.99, add_groupmate_rewards=True
+        )
     }

     trainer_settings.network_settings.memory = (
@@ -122,7 +124,11 @@ def test_coma_get_value_estimates(dummy_config, rnn, visual, discrete):
         max_step_complete=True,
         num_other_agents_in_group=NUM_AGENTS,
     )
-    value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+    (
+        value_estimates,
+        baseline_estimates,
+        next_value_estimates,
+    ) = optimizer.get_trajectory_and_baseline_value_estimates(
         trajectory.to_agentbuffer(),
         trajectory.next_obs,
         trajectory.next_group_obs,
@@ -138,7 +144,11 @@ def test_coma_get_value_estimates(dummy_config, rnn, visual, discrete):
     # if all_memories is not None:
    #     assert len(all_memories) == 15

-    value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+    (
+        value_estimates,
+        baseline_estimates,
+        next_value_estimates,
+    ) = optimizer.get_trajectory_and_baseline_value_estimates(
         trajectory.to_agentbuffer(),
         trajectory.next_obs,
         trajectory.next_group_obs,
@@ -150,7 +160,11 @@ def test_coma_get_value_estimates(dummy_config, rnn, visual, discrete):

     # Check if we ignore terminal states properly
     optimizer.reward_signals["group"].use_terminal_states = False
-    value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+    (
+        value_estimates,
+        baseline_estimates,
+        next_value_estimates,
+    ) = optimizer.get_trajectory_and_baseline_value_estimates(
         trajectory.to_agentbuffer(),
         trajectory.next_obs,
         trajectory.next_group_obs,

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py

Lines changed: 23 additions & 4 deletions
@@ -1,10 +1,12 @@
+from mlagents.trainers.buffer import BufferKey
 import pytest
+import numpy as np
 from mlagents.trainers.torch.components.reward_providers import (
     ExtrinsicRewardProvider,
     create_reward_provider,
 )
 from mlagents_envs.base_env import BehaviorSpec, ActionSpec
-from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
+from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
 from mlagents.trainers.tests.torch.test_reward_providers.utils import (
     create_agent_buffer,
 )
@@ -27,7 +29,7 @@
     ],
 )
 def test_construction(behavior_spec: BehaviorSpec) -> None:
-    settings = RewardSignalSettings()
+    settings = ExtrinsicSettings()
     settings.gamma = 0.2
     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
     assert extrinsic_rp.gamma == 0.2
@@ -46,7 +48,7 @@ def test_construction(behavior_spec: BehaviorSpec) -> None:
     ],
 )
 def test_factory(behavior_spec: BehaviorSpec) -> None:
-    settings = RewardSignalSettings()
+    settings = ExtrinsicSettings()
     extrinsic_rp = create_reward_provider(
         RewardSignalType.EXTRINSIC, behavior_spec, settings
     )
@@ -67,7 +69,24 @@ def test_factory(behavior_spec: BehaviorSpec) -> None:
 )
 def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
     buffer = create_agent_buffer(behavior_spec, 1000, reward)
-    settings = RewardSignalSettings()
+    settings = ExtrinsicSettings()
     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
     generated_rewards = extrinsic_rp.evaluate(buffer)
     assert (generated_rewards == reward).all()
+
+    # Test group rewards. Rewards should be double of the environment rewards, but shouldn't count
+    # the groupmate rewards.
+    buffer[BufferKey.GROUP_REWARD] = buffer[BufferKey.ENVIRONMENT_REWARDS]
+    # 2 agents with identical rewards
+    buffer[BufferKey.GROUPMATE_REWARDS].set(
+        [np.ones(1, dtype=np.float32) * reward] * 2
+        for _ in range(buffer.num_experiences)
+    )
+    generated_rewards = extrinsic_rp.evaluate(buffer)
+    assert (generated_rewards == 2 * reward).all()
+
+    # Test groupmate rewards. Total reward should be indiv_reward + 2 * teammate_reward + group_reward
+    settings.add_groupmate_rewards = True
+    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
+    generated_rewards = extrinsic_rp.evaluate(buffer)
+    assert (generated_rewards == 4 * reward).all()

ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py

Lines changed: 0 additions & 3 deletions
@@ -4,9 +4,6 @@
 from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (  # noqa F401
     ExtrinsicRewardProvider,
 )
-from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import (  # noqa F401
-    GroupExtrinsicRewardProvider,
-)
 from mlagents.trainers.torch.components.reward_providers.curiosity_reward_provider import (  # noqa F401
     CuriosityRewardProvider,
 )

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py

Lines changed: 32 additions & 1 deletion
@@ -5,11 +5,42 @@
 from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
     BaseRewardProvider,
 )
+from mlagents_envs.base_env import BehaviorSpec
+from mlagents.trainers.settings import ExtrinsicSettings


 class ExtrinsicRewardProvider(BaseRewardProvider):
+    """
+    Evaluates extrinsic reward. For single-agent, this equals the individual reward
+    given to the agent. For the COMA2 algorithm, we want not only the individual reward
+    but also the team and the individual rewards of the other agents.
+    """
+
+    def __init__(self, specs: BehaviorSpec, settings: ExtrinsicSettings) -> None:
+        super().__init__(specs, settings)
+        self._add_groupmate_rewards = settings.add_groupmate_rewards
+
     def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
-        return np.array(mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32)
+        indiv_rewards = np.array(
+            mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32
+        )
+        total_rewards = indiv_rewards
+        if (
+            BufferKey.GROUPMATE_REWARDS in mini_batch
+            and BufferKey.GROUP_REWARD in mini_batch
+        ):
+            if self._add_groupmate_rewards:
+                groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
+                groupmate_rewards_sum = np.array(
+                    [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
+                )
+                total_rewards += groupmate_rewards_sum
+            group_rewards = np.array(
+                mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32
+            )
+            # Add all the group rewards to the individual rewards
+            total_rewards += group_rewards
+        return total_rewards

     def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
         return {}
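
Taken together, the new evaluate() composes the extrinsic reward from three parts: the agent's own reward, the group reward, and (only when add_groupmate_rewards is on) the summed rewards of groupmates. A standalone sketch of that arithmetic, using plain numpy and a hypothetical combine_rewards helper rather than the real AgentBuffer path, matching the 2x/4x assertions in the updated test_extrinsic.py:

import numpy as np

def combine_rewards(indiv, group, groupmates, add_groupmate_rewards):
    # Hypothetical helper mirroring ExtrinsicRewardProvider.evaluate():
    # start from the individual rewards, optionally add the sum of groupmate
    # rewards, then always add the group reward.
    total = np.array(indiv, dtype=np.float32)
    if add_groupmate_rewards:
        total += np.array([sum(g) for g in groupmates], dtype=np.float32)
    total += np.array(group, dtype=np.float32)
    return total

r = 0.5
indiv = [r, r]                 # individual reward at two timesteps
group = [r, r]                 # group reward equals the individual reward
groupmates = [[r, r], [r, r]]  # two groupmates, each also earning r

assert (combine_rewards(indiv, group, groupmates, False) == 2 * r).all()  # indiv + group
assert (combine_rewards(indiv, group, groupmates, True) == 4 * r).all()   # + two groupmates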

ml-agents/mlagents/trainers/torch/components/reward_providers/group_extrinsic_reward_provider.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py

Lines changed: 0 additions & 4 deletions
@@ -15,9 +15,6 @@
 from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
     GAILRewardProvider,
 )
-from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import (
-    GroupExtrinsicRewardProvider,
-)
 from mlagents.trainers.torch.components.reward_providers.rnd_reward_provider import (
     RNDRewardProvider,
 )
@@ -26,7 +23,6 @@

 NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = {
     RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider,
-    RewardSignalType.GROUP_EXTRINSIC: GroupExtrinsicRewardProvider,
     RewardSignalType.CURIOSITY: CuriosityRewardProvider,
     RewardSignalType.GAIL: GAILRewardProvider,
     RewardSignalType.RND: RNDRewardProvider,
}
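
A quick sanity check (assuming the post-commit modules are importable) that the factory still resolves the extrinsic signal and that the group signal type no longer exists:

from mlagents.trainers.settings import RewardSignalType
from mlagents.trainers.torch.components.reward_providers import ExtrinsicRewardProvider
from mlagents.trainers.torch.components.reward_providers.reward_provider_factory import (
    NAME_TO_CLASS,
)

assert NAME_TO_CLASS[RewardSignalType.EXTRINSIC] is ExtrinsicRewardProvider
assert not hasattr(RewardSignalType, "GROUP_EXTRINSIC")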
