@@ -46,7 +46,7 @@ def create_matches(player, opponent=None, cores=1, match_number=10):
 
 
 
-def self_play(current_time, ite):
+def self_play(current_time, loaded_version):
     """
     Used to create a learning dataset for the value and policy network.
     Play against itself and backtrack the winner to maximize winner moves
@@ -56,21 +56,25 @@ def self_play(current_time, ite):
     client = MongoClient()
     collection = client.superGo[current_time]
     game_id = 0
-    improvements = 1
+    current_version = 1
     player = False
 
     while True:
 
         ## Load the player when restarting training
-        if ite:
-            new_player, improvements = load_player(current_time, ite)
+        if loaded_version:
+            new_player, checkpoint = load_player(current_time,
+                                                 loaded_version)
             game_id = collection.find().count()
-            ite = False
+            current_version = checkpoint['version'] + 1
+            loaded_version = False
         else:
-            new_player, improvements = get_player(current_time, improvements)
+            new_player, checkpoint = get_player(current_time, current_version)
+            if new_player:
+                current_version = checkpoint['version'] + 1
 
-        print("[PLAY] Current improvement level: %d" % improvements)
-        if improvements == 1 and not player and not new_player:
+        print("[PLAY] Current improvement level: %d" % current_version)
+        if current_version == 1 and not player and not new_player:
             print("[PLAY] Waiting for first player")
             time.sleep(5)
             continue
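The loop above trusts the version stored inside the checkpoint rather than a locally incremented counter, so a worker that misses several training updates resynchronizes on its next poll. Below is a minimal, self-contained sketch of that bookkeeping; the `published` list and this `get_player` signature are stand-ins for the repo's MongoDB-backed helpers, not its real API:

    # Stand-in for the checkpoints the training process has published so far.
    published = [{'version': 1}, {'version': 4}]

    def get_player(current_version):
        """Return the newest checkpoint if it is at least current_version,
        else None (no new model yet). Hypothetical helper for illustration."""
        newest = published[-1]
        return newest if newest['version'] >= current_version else None

    current_version = 1
    checkpoint = get_player(current_version)
    if checkpoint:
        # Resync from the checkpoint itself, not from current_version + 1,
        # so versions 2 and 3 are skipped cleanly when the worker polls late.
        current_version = checkpoint['version'] + 1
    print(current_version)  # -> 5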
@@ -92,7 +96,6 @@ def self_play(current_time, ite):
                 game_id += 1
         print("[PLAY] Done fetching")
         queue.close()
-        time.sleep(15)
 
 
 def play(player, opponent):
@@ -152,17 +155,11 @@ def __init__(self, player, id, color="black", mcts_flag=MCTS_FLAG, goban_size=GO
         self.id = id + 1
         self.board = self._create_board(color)
         self.player_color = 2 if color == "black" else 1
+        self.mcts = mcts_flag
         if mcts_flag:
-            if opponent:
-                self.player = (player, MCTS(player, competitive=True))
-                self.opponent = (opponent, MCTS(player, competitive=True))
-            else:
-                self.player = (player, MCTS(player))
-                self.opponent = False
-        else:
-            self.player = (player, False)
-            self.opponent = False
-
+            self.mcts = MCTS()
+        self.player = player
+        self.opponent = opponent
 
     def _create_board(self, color):
         """
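`__init__` now keeps a single `self.mcts` attribute that is either falsy (play straight from the policy network) or one search object shared by both players, so `_play` can branch on truthiness alone. A small runnable sketch of that flag-or-object pattern, with `DummyMCTS` standing in for the repo's `MCTS` class:

    class DummyMCTS:
        """Stand-in for the real MCTS class; returns a canned move."""
        def search(self, board):
            return "tree-searched move"

    class Game:
        def __init__(self, mcts_flag):
            self.mcts = mcts_flag      # stays False when tree search is off
            if mcts_flag:
                self.mcts = DummyMCTS()

        def pick(self, board):
            if self.mcts:              # truthy object: delegate to tree search
                return self.mcts.search(board)
            return "raw policy move"   # falsy flag: query the network directly

    print(Game(True).pick(None))   # -> tree-searched move
    print(Game(False).pick(None))  # -> raw policy move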
@@ -200,28 +197,39 @@ def _get_move(self, board, probas):
         return player_move
 
 
-    def _play(self, state, player):
+    def _play(self, state, player, other_pass, competitive=False):
         """ Choose a move depending on MCTS or not """
 
-        if player[1]:
-            action_scores, action = player[1].search(self.board)
+        if self.mcts:
+            if player.passed is True or other_pass:
+                action_scores = np.zeros((self.goban_size ** 2 + 1,))
+                action_scores[-1] = 1
+                action = self.goban_size ** 2
+            else:
+                action_scores, action = self.mcts.search(self.board, player, \
+                                                         competitive=competitive)
+
+            if action == self.goban_size ** 2:
+                player.passed = True
+
         else:
             feature_maps = player.extractor(state)
             probas = player.policy_net(feature_maps)[0] \
                 .cpu().data.numpy()
             if player.passed is True:
-                player_move = self.goban_size ** 2
+                action = self.goban_size ** 2
             else:
-                player_move = self._get_move(self.board, probas)
+                action = self._get_move(self.board, probas)
 
-            if player_move == self.goban_size ** 2:
+            if action == self.goban_size ** 2:
                 player.passed = True
 
             action_scores = np.zeros((self.goban_size ** 2 + 1),)
-            action_scores[player_move] = 1
+            action_scores[action] = 1
 
-        state, reward, done = self.board.step(player_move)
-        return state, reward, done, action_scores
+        state, reward, done = self.board.step(action)
+        self.board.render()
+        return state, reward, done, action_scores, action
 
 
     def __call__(self):
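`_play` encodes moves over `goban_size ** 2 + 1` actions: one slot per board intersection plus a trailing pass slot, which is why a forced pass can be written either as index `goban_size ** 2` or as `action_scores[-1]`. A self-contained check of that encoding on a 9x9 board:

    import numpy as np

    goban_size = 9
    pass_action = goban_size ** 2            # index 81 on a 9x9 board

    action_scores = np.zeros((goban_size ** 2 + 1,))
    action_scores[-1] = 1                    # last slot == the pass slot

    assert action_scores[pass_action] == 1   # both spellings hit the same entry
    print(action_scores.shape)               # -> (82,)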
@@ -235,23 +243,28 @@ def __call__(self):
         state = self.board.reset()
         dataset = []
         moves = 0
+        comp = False
 
         while not done:
             ## Prevent cycling in 2 atari situations
-            if moves > 60 * self.goban_size:
+            if moves > MOVE_LIMIT:
                 return False
+
+            if moves > MOVE_LIMIT / 24:
+                comp = True
 
             ## For evaluation
             if self.opponent:
-                state, reward, done, _ = self._play(_prepare_state(state), \
-                                            self.player)
-                state, reward, done, _ = self._play(_prepare_state(state), \
-                                            self.opponent)
+                state, reward, done, _, action = self._play(_prepare_state(state), \
+                                    self.player, self.opponent.passed, competitive=True)
+                state, reward, done, _, action = self._play(_prepare_state(state), \
+                                    self.opponent, self.player.passed, competitive=True)
                 moves += 2
+
             ## For self-play
             else:
                 state = _prepare_state(state)
-                new_state, reward, done, probas = self._play(state, self.player)
+                new_state, reward, done, probas, _ = self._play(state, self.player, False, competitive=comp)
                 self._swap_color()
                 dataset.append((state.cpu().data.numpy(), probas, \
                                 self.player_color))
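Flipping `comp` to True after `MOVE_LIMIT / 24` moves mirrors the AlphaGo Zero temperature schedule: the opening moves are sampled in proportion to MCTS visit counts for exploration, and later moves take the argmax. A minimal sketch of the two selection modes, with made-up visit counts:

    import numpy as np

    visits = np.array([10., 25., 5., 60.])   # illustrative MCTS visit counts
    probs = visits / visits.sum()

    def select(probs, competitive):
        if competitive:
            return int(np.argmax(probs))                   # deterministic
        return int(np.random.choice(len(probs), p=probs))  # exploratory

    print(select(probs, competitive=False))  # varies from run to run
    print(select(probs, competitive=True))   # always 3, the most-visited move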
@@ -260,7 +273,9 @@ def __call__(self):
 
         ## Pickle the result because multiprocessing
         if self.opponent:
+            self.opponent.passed = False
             return pickle.dumps([reward])
+        self.player.passed = False
         return pickle.dumps((dataset, reward))
 
 
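Clearing `passed` before pickling matters because player objects outlive a single game: a flag left True after a game ends on consecutive passes would make the reused player open the next game with an immediate pass. A tiny illustration with a stand-in player class:

    class Player:
        """Stand-in holding only the state that persists across games."""
        def __init__(self):
            self.passed = False

    player = Player()
    player.passed = True    # set when the game ended on consecutive passes
    player.passed = False   # reset, so the next game starts fresh
    assert not player.passed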
@@ -271,9 +286,9 @@ def solo_play(self, move=None):
         ## Agent plays the first move of the game
         if move is None:
             state = _prepare_state(self.board.state)
-            state, reward, done, player_move = self._play(state, self.player)
+            state, reward, done, probas, move = self._play(state, self.player, False)
             self._swap_color()
-            return player_move
+            return move
         ## Otherwise just play a move and answer it
         else:
             state, reward, done = self.board.step(move)