@@ -43,33 +43,33 @@ def forward(self, x):
         return F.softmax(action_scores)


-model = Policy()
-optimizer = optim.Adam(model.parameters(), lr=1e-2)
+policy = Policy()
+optimizer = optim.Adam(policy.parameters(), lr=1e-2)


 def select_action(state):
     state = torch.from_numpy(state).float().unsqueeze(0)
-    probs = model(Variable(state))
+    probs = policy(Variable(state))
     action = probs.multinomial()
-    model.saved_actions.append(action)
+    policy.saved_actions.append(action)
     return action.data


 def finish_episode():
     R = 0
     rewards = []
-    for r in model.rewards[::-1]:
+    for r in policy.rewards[::-1]:
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
     rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
-    for action, r in zip(model.saved_actions, rewards):
+    for action, r in zip(policy.saved_actions, rewards):
         action.reinforce(r)
     optimizer.zero_grad()
-    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
+    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_actions[:]
+    del policy.rewards[:]
+    del policy.saved_actions[:]


 running_reward = 10
@@ -80,7 +80,7 @@ def finish_episode():
         state, reward, done, _ = env.step(action[0, 0])
         if args.render:
             env.render()
-        model.rewards.append(reward)
+        policy.rewards.append(reward)
         if done:
             break

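
For context, below is a minimal sketch of the outer episode loop that drives the renamed pieces (select_action, finish_episode, policy.rewards). The per-episode step limit, the running-reward smoothing constants, and the args.log_interval logging are illustrative assumptions rather than a verbatim excerpt of the file:

from itertools import count

# Sketch only: `env`, `args`, `policy`, `select_action`, and `finish_episode`
# are the objects defined in the script above; bounds and logging are assumed.
running_reward = 10
for i_episode in count(1):
    state = env.reset()
    for t in range(10000):  # assumed per-episode step limit
        action = select_action(state)
        state, reward, done, _ = env.step(action[0, 0])
        if args.render:
            env.render()
        policy.rewards.append(reward)  # stored for finish_episode()
        if done:
            break

    # Smooth the episode length, then do one REINFORCE update.
    running_reward = running_reward * 0.99 + t * 0.01
    finish_episode()
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))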