
Commit 4d27a54

[ALL] MCTS done, fixing dihedral rotation of the board during training; project should be close to done
1 parent ec66e85 commit 4d27a54

16 files changed, 359 additions and 212 deletions

.gitignore
Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@ __pycache__
 data/*
 Sabaki/
 *.pyc
-test.py
 saved_models/
 *.py.lprof
 pytorch/

README.md
Lines changed: 9 additions & 8 deletions

@@ -6,8 +6,9 @@ Ongoing project.
 
 # TODO (in order of priority)
 
-* Dihedral group of board for more training samples
-* Sample random rotation or reflection in dihedral group during MCTS
+* Optimization ?
+* MCTS
+* Multithreading of search (can't multiprocess because of virtual loss, but useless in Python) ?
 * File of constants that match the paper constants
 * OGS / KGS API
 * Better Komi ?
@@ -16,14 +17,15 @@ Ongoing project.
 # CURRENTLY DOING
 
 * Brainlag on loss: cross entropy or KLDiv (cross entropy - entropy) ??
-* MCTS
-* Tree search
-* Adaptive temperature (close to 1 during the first 30 moves of self-play, close to 0 after and during evaluation)
-* Dirichlet noise to prior probabilities in the root node
-* Multithreading of search (can't multiprocess because of virtual loss, but useless in Python) ?
+* Dihedral group of board for more training samples
+* Sample random rotation or reflection in dihedral group during MCTS
 
 # DONE
 
+* MCTS
+* Tree search
+* Dirichlet noise to prior probabilities in the root node
+* Adaptive temperature (either take max or proportionally)
 * Learning without MCTS doesn't seem to work
 * Resume training
 * GTP on trained models (human.py, to plug with Sabaki)
@@ -41,7 +43,6 @@ Ongoing project.
 
 * Compile my own version of Sabaki to watch games automatically while training
 * Statistics
-* Optimization ?
 * Tromp Taylor scoring ?
 * Resignation ?
 * Training on a big computer / server once everything is ready ?
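Note on the "Adaptive temperature (either take max or proportionally)" item now listed under DONE: in AlphaGo Zero-style self-play the move is drawn from the MCTS visit counts, with a temperature close to 1 for the early moves (proportional sampling) and close to 0 afterwards and during evaluation (argmax). A minimal sketch of that selection rule, with illustrative names (select_action, visit_counts) that are not taken from this repository:

import numpy as np

def select_action(visit_counts, temperature=1.0):
    """Pick a move index from MCTS visit counts.

    temperature ~ 1 samples proportionally to the visit counts
    (exploratory, early self-play moves); temperature -> 0 degenerates
    to the argmax (competitive play and evaluation).
    """
    counts = np.asarray(visit_counts, dtype=np.float64)
    if temperature < 1e-3:
        return int(np.argmax(counts))            # greedy: most visited move
    probas = counts ** (1.0 / temperature)
    probas /= probas.sum()
    return int(np.random.choice(len(counts), p=probas))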

const.py
Lines changed: 12 additions & 14 deletions

@@ -9,7 +9,7 @@
 DTYPE_FLOAT = torch.cuda.FloatTensor if CUDA else torch.FloatTensor
 DTYPE_LONG = torch.cuda.LongTensor if CUDA else torch.LongTensor
 ## Number of self-play parallel games
-PARRALEL_SELF_PLAY = 1
+PARRALEL_SELF_PLAY = 3
 ## Number of evaluation parralel games
 PARRALEL_EVAL = 2
 ## MCTS parallel
@@ -21,22 +21,20 @@
 
 ## Size of the Go board
 GOBAN_SIZE = 9
+## Number of move to end a game
+MOVE_LIMIT = GOBAN_SIZE ** 2 * 2.2
 ## Number of last states to keep
 HISTORY = 7
 ## Learning rate
 LR = 0.01
-## Number of epochs
-EPOCHS = 100
 ## Number of MCTS simulation
-MCTS_SIM = 200
-## Temperature
-TEMP = 2
+MCTS_SIM = 5
 ## Exploration constant
 C_PUCT = 0.2
 ## L2 Regularization
-L2_REG = 0.0001
+L2_REG = 0.001
 ## Momentum
-MOMENTUM = 0.92
+MOMENTUM = 0.9
 ## Activate MCTS
 MCTS_FLAG = True
 ## Epsilon for Dirichlet noise
@@ -50,27 +48,27 @@
 ##### SELF-PLAY
 
 ## Number of self-play before training
-SELF_PLAY_MATCH = 40
+SELF_PLAY_MATCH = 2 * PARRALEL_SELF_PLAY
 
 #####
 
 
 ##### TRAINING
 
 ## Number of moves to consider when creating the batch
-MOVES = 10000
+MOVES = 2000
 ## Number of mini-batch before evaluation during training
-BATCH_SIZE = 64
+BATCH_SIZE = 32
 ## Number of channels of the output feature maps
 OUTPLANES_MAP = 10
 ## Shape of the input state
 INPLANES = (HISTORY + 1) * 2 + 1
 ## Probabilities for all moves + pass
 OUTPLANES = (GOBAN_SIZE ** 2) + 1
 ## Number of residual blocks
-BLOCKS = 10
+BLOCKS = 5
 ## Number of training step before evaluating
-TRAIN_STEPS = 200
+TRAIN_STEPS = 400
 ## Optimizer
 ADAM = False
 ## Learning rate annealing factor
@@ -85,7 +83,7 @@
 
 ## Number of matches against its old version to evaluate
 ## the newly trained network
-EVAL_MATCHS = 50
+EVAL_MATCHS = 20
 ## Threshold to keep the new neural net
 EVAL_THRESH = 0.53
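The shape constants above presumably follow the AlphaGo Zero input encoding: with the committed defaults (GOBAN_SIZE = 9, HISTORY = 7) the network sees 17 input planes (8 past board states per colour plus one colour-to-play plane) and outputs 82 move probabilities (81 intersections plus pass). A quick check of that arithmetic, using the same expressions as const.py:

GOBAN_SIZE = 9
HISTORY = 7

INPLANES = (HISTORY + 1) * 2 + 1    # 8 states per colour + 1 colour plane = 17
OUTPLANES = (GOBAN_SIZE ** 2) + 1   # 81 intersections + 1 pass move = 82
MOVE_LIMIT = GOBAN_SIZE ** 2 * 2.2  # games are aborted after ~178 moves

print(INPLANES, OUTPLANES, MOVE_LIMIT)   # 17 82 178.2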

lib/dataset.py
Lines changed: 5 additions & 1 deletion

@@ -2,6 +2,7 @@
 from const import *
 import numpy as np
 import timeit
+from . import utils
 
 class SelfPlayDataset(Dataset):
     """
@@ -23,8 +24,11 @@ def __len__(self):
 
 
     def __getitem__(self, idx):
-        return self.states[idx], self.plays[idx], \
+
+        return utils.sample_rotation(self.states[idx]), self.plays[idx], \
               self.winners[idx]
+        # return self.states[idx], self.plays[idx], \
+        #        self.winners[idx]
 
 
     def update(self, game):
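The new utils.sample_rotation helper is not part of this diff. Assuming it applies a random element of the dihedral group of the square (4 rotations, optionally mirrored, i.e. 8 symmetries) to the stacked feature planes, it could look roughly like the sketch below; names and shapes are assumptions, not code from the repository:

import numpy as np

def sample_rotation(state):
    """Apply one random board symmetry (rotation and/or reflection) to a
    (planes, GOBAN_SIZE, GOBAN_SIZE) state; every plane gets the same
    transformation so the history planes stay consistent."""
    k = np.random.randint(4)        # number of 90-degree rotations
    flip = np.random.randint(2)     # whether to also mirror the board
    new_state = np.rot90(state, k, axes=(1, 2))
    if flip:
        new_state = np.flip(new_state, axis=2)
    return np.ascontiguousarray(new_state)

Note that transforming only the state is not enough on its own: the training target self.plays[idx] (the move probabilities) would need to be permuted with the same symmetry to stay aligned with the board, which may be what the "fixing dihedral rotation of the board during training" part of the commit message refers to.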

lib/go.py
Lines changed: 4 additions & 1 deletion

@@ -90,7 +90,7 @@ def get_legal_moves(self):
 
         for pachi_move in legal_moves:
             move = _coord_to_action(self.board, pachi_move)
-            if self.test_move(move):
+            if move != 81 or self.test_move(move):
                final_moves.append(move)
 
         if len(final_moves) == 0:
@@ -172,6 +172,9 @@ def step(self, action):
 
 
     def __deepcopy__(self, memo):
+        """ Used to overwrite the deepcopy implicit method since
+            the board cannot be deepcopied """
+
         cls = self.__class__
         result = cls.__new__(cls)
         memo[id(self)] = result
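The rest of the __deepcopy__ override lies outside this hunk. The usual pattern for such an override, sketched below with an assumed clone() call on the wrapped pachi board (not confirmed by this diff), is to copy every attribute normally and special-case the one object that deepcopy cannot handle:

import copy

def __deepcopy__(self, memo):
    """Copy all attributes, special-casing the board object that
    deepcopy cannot handle (illustrative sketch, not the repo's code)."""
    cls = self.__class__
    result = cls.__new__(cls)
    memo[id(self)] = result
    for key, value in self.__dict__.items():
        if key == "board":
            setattr(result, key, self.board.clone())   # assumed board-cloning API
        else:
            setattr(result, key, copy.deepcopy(value, memo))
    return result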

lib/gtp.py
Lines changed: 5 additions & 5 deletions

@@ -129,11 +129,11 @@ def __init__(self, game, komi=7.5, board_size=19, version="0.2", name="AlphaGo")
     def send(self, message):
         message_id, command, arguments = parse_message(message)
         if command in self.known_commands:
-            try:
-                return format_success(
-                    message_id, getattr(self, "cmd_" + command)(arguments))
-            except ValueError as exception:
-                return format_error(message_id, exception.args[0])
+            # try:
+            return format_success(
+                message_id, getattr(self, "cmd_" + command)(arguments))
+            # except ValueError as exception:
+            #     return format_error(message_id, exception.args[0])
         else:
             return format_error(message_id, "unknown command")

lib/play.py
Lines changed: 51 additions & 36 deletions

@@ -46,7 +46,7 @@ def create_matches(player, opponent=None, cores=1, match_number=10):
 
 
 
-def self_play(current_time, ite):
+def self_play(current_time, loaded_version):
     """
     Used to create a learning dataset for the value and policy network.
     Play against itself and backtrack the winner to maximize winner moves
@@ -56,21 +56,25 @@ def self_play(current_time, ite):
     client = MongoClient()
     collection = client.superGo[current_time]
     game_id = 0
-    improvements = 1
+    current_version = 1
     player = False
 
     while True:
 
         ## Load the player when restarting traning
-        if ite:
-            new_player, improvements = load_player(current_time, ite)
+        if loaded_version:
+            new_player, checkpoint = load_player(current_time,
+                                        loaded_version)
             game_id = collection.find().count()
-            ite = False
+            current_version = checkpoint['version'] + 1
+            loaded_version = False
         else:
-            new_player, improvements = get_player(current_time, improvements)
+            new_player, checkpoint = get_player(current_time, current_version)
+            if new_player:
+                current_version = checkpoint['version'] + 1
 
-        print("[PLAY] Current improvement level: %d" % improvements)
-        if improvements == 1 and not player and not new_player:
+        print("[PLAY] Current improvement level: %d" % current_version)
+        if current_version == 1 and not player and not new_player:
             print("[PLAY] Waiting for first player")
             time.sleep(5)
             continue
@@ -92,7 +96,6 @@ def self_play(current_time, ite):
            game_id += 1
         print("[PLAY] Done fetching")
         queue.close()
-        time.sleep(15)
 
 
 def play(player, opponent):
@@ -152,17 +155,11 @@ def __init__(self, player, id, color="black", mcts_flag=MCTS_FLAG, goban_size=GO
         self.id = id + 1
         self.board = self._create_board(color)
         self.player_color = 2 if color == "black" else 1
+        self.mcts = mcts_flag
         if mcts_flag:
-            if opponent:
-                self.player = (player, MCTS(player, competitive=True))
-                self.opponent = (opponent, MCTS(player, competitive=True))
-            else:
-                self.player = (player, MCTS(player))
-                self.opponent = False
-        else:
-            self.player = (player, False)
-            self.opponent = False
-
+            self.mcts = MCTS()
+        self.player = player
+        self.opponent = opponent
 
     def _create_board(self, color):
         """
@@ -200,28 +197,39 @@ def _get_move(self, board, probas):
         return player_move
 
 
-    def _play(self, state, player):
+    def _play(self, state, player, other_pass, competitive=False):
         """ Choose a move depending on MCTS or not """
 
-        if player[1]:
-            action_scores, action = player[1].search(self.board)
+        if self.mcts:
+            if player.passed is True or other_pass:
+                action_scores = np.zeros((self.goban_size ** 2 + 1,))
+                action_scores[-1] = 1
+                action = self.goban_size ** 2
+            else:
+                action_scores, action = self.mcts.search(self.board, player,\
+                                            competitive=competitive)
+
+            if action == self.goban_size ** 2:
+                player.passed = True
+
         else:
             feature_maps = player.extractor(state)
             probas = player.policy_net(feature_maps)[0] \
                                 .cpu().data.numpy()
             if player.passed is True:
-                player_move = self.goban_size ** 2
+                action = self.goban_size ** 2
             else:
-                player_move = self._get_move(self.board, probas)
+                action = self._get_move(self.board, probas)
 
-            if player_move == self.goban_size ** 2:
+            if action == self.goban_size ** 2:
                player.passed = True
 
             action_scores = np.zeros((self.goban_size ** 2 + 1),)
-            action_scores[player_move] = 1
+            action_scores[action] = 1
 
-        state, reward, done = self.board.step(player_move)
-        return state, reward, done, action_scores
+        state, reward, done = self.board.step(action)
+        self.board.render()
+        return state, reward, done, action_scores, action
 
 
     def __call__(self):
@@ -235,23 +243,28 @@ def __call__(self):
         state = self.board.reset()
         dataset = []
         moves = 0
+        comp = False
 
         while not done:
             ## Prevent cycling in 2 atari situations
-            if moves > 60 * self.goban_size:
+            if moves > MOVE_LIMIT:
                 return False
+
+            if moves > MOVE_LIMIT / 24:
+                comp = True
 
             ## For evaluation
             if self.opponent:
-                state, reward, done, _ = self._play(_prepare_state(state), \
-                                            self.player)
-                state, reward, done, _ = self._play(_prepare_state(state), \
-                                            self.opponent)
+                state, reward, done, _, action = self._play(_prepare_state(state), \
+                                            self.player, self.opponent.passed, competitive=True)
+                state, reward, done, _, action = self._play(_prepare_state(state), \
+                                            self.opponent, self.player.passed, competitive=True)
                 moves += 2
+
             ## For self-play
             else:
                 state = _prepare_state(state)
-                new_state, reward, done, probas = self._play(state, self.player)
+                new_state, reward, done, probas, _ = self._play(state, self.player, False, competitive=comp)
                 self._swap_color()
                 dataset.append((state.cpu().data.numpy(), probas, \
                                 self.player_color))
@@ -260,7 +273,9 @@ def __call__(self):
 
         ## Pickle the result because multiprocessing
         if self.opponent:
+            self.opponent.passed = False
            return pickle.dumps([reward])
+        self.player.passed = False
         return pickle.dumps((dataset, reward))
 
 
@@ -271,9 +286,9 @@ def solo_play(self, move=None):
         ## Agent plays the first move of the game
         if move is None:
            state = _prepare_state(self.board.state)
-            state, reward, done, player_move = self._play(state, self.player)
+            state, reward, done, probas, move = self._play(state, self.player)
             self._swap_color()
-            return player_move
+            return move
         ## Otherwise just play a move and answer it
         else:
            state, reward, done = self.board.step(move)
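GameState._play above now delegates to a single MCTS instance via self.mcts.search(self.board, player, competitive=competitive). The MCTS class itself lives elsewhere in the project and is not part of this diff; the stub below only illustrates the interface the new code relies on (the shapes are assumptions and the visit counts are faked):

import numpy as np

GOBAN_SIZE = 9
N_ACTIONS = GOBAN_SIZE ** 2 + 1   # 81 intersections + pass


class MCTS:
    """Interface sketch of the tree search used by GameState._play."""

    def search(self, board, player, competitive=False):
        # The real implementation would run MCTS_SIM simulations guided by
        # the player's policy/value network; uniform counts stand in here.
        visit_counts = np.ones(N_ACTIONS)
        action_scores = visit_counts / visit_counts.sum()   # proba over all moves + pass
        if competitive:
            action = int(np.argmax(action_scores))          # greedy (evaluation, late moves)
        else:
            action = int(np.random.choice(N_ACTIONS, p=action_scores))
        return action_scores, action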
