@@ -43,33 +43,33 @@ def forward(self, x):
         return F.softmax(action_scores)
 
 
-model = Policy()
-optimizer = optim.Adam(model.parameters(), lr=1e-2)
+policy = Policy()
+optimizer = optim.Adam(policy.parameters(), lr=1e-2)
 
 
 def select_action(state):
     state = torch.from_numpy(state).float().unsqueeze(0)
-    probs = model(Variable(state))
+    probs = policy(Variable(state))
     action = probs.multinomial()
-    model.saved_actions.append(action)
+    policy.saved_actions.append(action)
     return action.data
 
 
 def finish_episode():
     R = 0
     rewards = []
-    for r in model.rewards[::-1]:
+    for r in policy.rewards[::-1]:
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
     rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
-    for action, r in zip(model.saved_actions, rewards):
+    for action, r in zip(policy.saved_actions, rewards):
         action.reinforce(r)
     optimizer.zero_grad()
-    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
+    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_actions[:]
+    del policy.rewards[:]
+    del policy.saved_actions[:]
 
 
 running_reward = 10
@@ -80,7 +80,7 @@ def finish_episode():
         state, reward, done, _ = env.step(action[0,0])
         if args.render:
             env.render()
-        model.rewards.append(reward)
+        policy.rewards.append(reward)
         if done:
             break
 
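Note for readers on newer PyTorch: `Variable`, `probs.multinomial()` and `action.reinforce(r)` in the code above come from an older PyTorch API (stochastic nodes). A minimal sketch of the same REINFORCE update written against `torch.distributions.Categorical` is shown below; names such as `saved_log_probs`, `select_action_modern` and `finish_episode_modern` are illustrative only and are not part of this change.

# Sketch only: same policy-gradient update as the diff above, expressed as an
# explicit -log_prob * return loss instead of action.reinforce() + backward().
# Assumes a PyTorch version that provides torch.distributions.
import torch
from torch.distributions import Categorical

def select_action_modern(policy, state, saved_log_probs):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)                          # softmax action probabilities
    dist = Categorical(probs)
    action = dist.sample()                         # replaces probs.multinomial()
    saved_log_probs.append(dist.log_prob(action))  # replaces policy.saved_actions
    return action.item()

def finish_episode_modern(optimizer, saved_log_probs, rewards, gamma=0.99, eps=1e-8):
    # Discounted returns, accumulated back-to-front exactly as in finish_episode.
    R, returns = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    # REINFORCE loss: sum over the episode of -log pi(a_t|s_t) * normalized return.
    loss = torch.stack([-lp * R for lp, R in zip(saved_log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    del rewards[:]
    del saved_log_probs[:]

The discounted, mean/std-normalized returns are computed exactly as in `finish_episode` above; only the gradient plumbing differs, with the explicit loss standing in for `action.reinforce(r)` followed by `autograd.backward` over the saved stochastic actions.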