Commit d298ed0

[ALL] Loss function brainlag, cleaning for MCTS, started dihedral rotation sampler
1 parent 70a02f8

8 files changed, +88 / -56 lines


README.md

Lines changed: 4 additions & 2 deletions
```diff
@@ -6,19 +6,21 @@ Ongoing project.
 
 # TODO (in order of priority)
 
+* Dihedral group of board for more training samples
+* Sample random rotation or reflection in dihedral group during MCTS
 * File of constants that match the paper constants
 * OGS / KGS API
 * Better Komi ?
 * Use logging instead of prints ?
 
 # CURRENTLY DOING
 
+* Brainlag on loss : cross entropy or KLDiv (crossentropy - entropy) ??
 * MCTS
 * Tree search
-* Rotation of board for more training samples
 * Adaptative temperature (close to 1 during the first 30 moves of self-play, close to 0 after and during evaluation)
 * Dirichlet noise to prior probabilities in the rootnode
-* Multiprocessing of search
+* Multiprocessing of search ?
 
 # DONE
 
```

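Note on the "cross entropy or KLDiv" question in the TODO: for a fixed target distribution p (e.g. normalized MCTS visit counts), KL(p ‖ q) = CE(p, q) − H(p), and H(p) does not depend on the network output q, so minimizing either objective gives the same gradients. A quick NumPy check of that identity (not part of this commit; the distributions are made up for illustration):

```python
import numpy as np

# Hypothetical target distribution p and network output q; any two
# valid probability distributions will do.
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])

cross_entropy = -np.sum(p * np.log(q))     # CE(p, q)
entropy       = -np.sum(p * np.log(p))     # H(p)
kl_divergence = np.sum(p * np.log(p / q))  # KL(p || q)

# KL(p || q) == CE(p, q) - H(p): since H(p) is constant for a fixed
# target, minimizing CE and minimizing KL w.r.t. q are equivalent.
assert np.isclose(kl_divergence, cross_entropy - entropy)
```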
const.py

Lines changed: 6 additions & 8 deletions
```diff
@@ -8,9 +8,7 @@
 ## Dtype of the tensors depending on CUDA
 DTYPE_FLOAT = torch.cuda.FloatTensor if CUDA else torch.FloatTensor
 DTYPE_LONG = torch.cuda.LongTensor if CUDA else torch.LongTensor
-## Number of process, used for parallel matching atm
 ## Number of self-play parallel games
-# PARRALEL_SELF_PLAY = multiprocessing.cpu_count() - 2
 PARRALEL_SELF_PLAY = 1
 ## Number of evaluation parralel games
 PARRALEL_EVAL = 2
@@ -40,7 +38,9 @@
 ## Momentum
 MOMENTUM = 0.92
 ## Activate MCTS
-MCTS_FLAG = False
+MCTS_FLAG = True
+## Alpha for Dirichlet noise
+EPS = 0.25
 
 #####
 
@@ -49,8 +49,6 @@
 
 ## Number of self-play before training
 SELF_PLAY_MATCH = 40
-## Number of matches to run per process
-NUM_MATCHES = SELF_PLAY_MATCH // PARRALEL_SELF_PLAY
 
 #####
 
@@ -60,7 +58,7 @@
 ## Number of moves to consider when creating the batch
 MOVES = 10000
 ## Number of mini-batch before evaluation during training
-BATCH_SIZE = 128
+BATCH_SIZE = 64
 ## Number of channels of the output feature maps
 OUTPLANES_MAP = 10
 ## Shape of the input state
@@ -72,10 +70,10 @@
 ## Number of training step before evaluating
 TRAIN_STEPS = 200
 ## Optimizer
-ADAM = True
+ADAM = False
 ## Learning rate annealing factor
 LR_DECAY = 0.1
-## Learning rate anmnealing interval
+## Learning rate annnealing interval
 LR_DECAY_ITE = 50 * TRAIN_STEPS
 
 #####
```

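Note: the new EPS = 0.25 matches the mixing weight ε used for root Dirichlet noise in the AlphaGo Zero paper, where the root priors become P(s,a) = (1 − ε)·p_a + ε·η_a with η ~ Dir(α) (α = 0.03 for 19×19 Go); the constant actually added here is the mixing weight, not the Dirichlet α. A possible sketch of that mixing step, assuming the root node exposes a prior vector (ALPHA and the function name are assumptions, not code from this commit):

```python
import numpy as np

EPS = 0.25    # mixing weight, as in const.py
ALPHA = 0.03  # assumed Dirichlet concentration; not defined in this commit

def add_dirichlet_noise(priors, eps=EPS, alpha=ALPHA):
    """Mix Dirichlet noise into the root prior probabilities:
    P = (1 - eps) * p + eps * eta, with eta ~ Dir(alpha)."""
    noise = np.random.dirichlet([alpha] * len(priors))
    return (1 - eps) * np.asarray(priors) + eps * noise
```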
lib/dataset.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -14,7 +14,7 @@ def __init__(self, mcts_flag=MCTS_FLAG):
 
         self.states = np.zeros((MOVES, (HISTORY + 1) * 2 + 1, GOBAN_SIZE, GOBAN_SIZE))
         self.mcts_flag = mcts_flag
-        if not mcts_flag:
+        if mcts_flag:
             self.plays = np.zeros((MOVES, GOBAN_SIZE ** 2 + 1))
         else:
             self.plays = np.zeros(MOVES)
@@ -42,7 +42,7 @@ def update(self, game):
         self.states[:number_moves] = np.vstack(dataset[:,0])
 
         self.plays = np.roll(self.plays, number_moves, axis=0)
-        if not self.mcts_flag:
+        if self.mcts_flag:
             self.plays[np.arange(number_moves),np.hstack(dataset[:,1])] = 1
         else:
             self.plays[:number_moves] = np.hstack(dataset[:,1])
@@ -54,6 +54,7 @@ def update(self, game):
         self.winners[:number_moves] = winners
         return number_moves
 
+
     def update_batch(self, raw_dataset):
         for game in raw_dataset:
             self.update(game)
```

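With the inverted conditions fixed, MCTS mode stores a full one-hot vector over the GOBAN_SIZE² + 1 actions (every intersection plus pass) per position, while the non-MCTS mode keeps only the integer index of the played move. A toy illustration of the two layouts, assuming a hypothetical GOBAN_SIZE of 9 (not part of the commit):

```python
import numpy as np

GOBAN_SIZE = 9                    # hypothetical value for illustration
N_ACTIONS = GOBAN_SIZE ** 2 + 1   # every intersection plus the pass move

# MCTS mode: a probability/one-hot vector over actions per position.
plays_mcts = np.zeros((3, N_ACTIONS))
plays_mcts[np.arange(3), [0, 41, 81]] = 1  # one-hot targets for 3 moves

# Non-MCTS mode: just the index of the move that was played.
plays_plain = np.array([0, 41, 81])
```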
lib/play.py

Lines changed: 13 additions & 31 deletions
```diff
@@ -184,24 +184,6 @@ def _swap_color(self):
         else:
             self.player_color = 1
 
-
-    def _draw_move(self, action_scores, competitive=False):
-        """
-        Find the best move, either deterministically for competitive play
-        or stochiasticly according to some temperature constant
-        """
-
-        if competitive:
-            move = np.argmax(action_scores)
-
-        else:
-            action_scores = np.power(action_scores, (1. / TEMP))
-            total = np.sum(action_scores)
-            probas = action_scores / total
-            move = np.random.choice(action_scores.shape[0], p=probas)
-
-        return move
-
 
     def _get_move(self, board, probas):
         """ Select a move without MCTS """
@@ -235,22 +217,22 @@ def _get_move(self, board, probas):
     def _play(self, state, player):
         """ Choose a move depending on MCTS or not """
 
-        if not self.mcts_flag:
-            action_scores = player.mcts.search()
+        # if self.mcts_flag:
+        #     action_scores = player.mcts.search()
+        # else:
+        feature_maps = player.extractor(state)
+        probas = player.policy_net(feature_maps)[0] \
+                        .cpu().data.numpy()
+        if player.passed is True:
+            player_move = self.goban_size ** 2
         else:
-            feature_maps = player.extractor(state)
-            probas = player.policy_net(feature_maps)[0] \
-                            .cpu().data.numpy()
-            if player.passed is True:
-                player_move = self.goban_size ** 2
-            else:
-                player_move = self._get_move(self.board, probas)
+            player_move = self._get_move(self.board, probas)
 
-            if player_move == self.goban_size ** 2:
-                player.passed = True
+        if player_move == self.goban_size ** 2:
+            player.passed = True
 
-            state, reward, done = self.board.step(player_move)
-            return state, reward, done, player_move
+        state, reward, done = self.board.step(player_move)
+        return state, reward, done, player_move
 
 
     def __call__(self):
```

lib/train.py

Lines changed: 11 additions & 8 deletions
```diff
@@ -1,4 +1,5 @@
 import torch
+import torch.nn as nn
 import numpy as np
 import pickle
 import time
@@ -20,16 +21,18 @@
 class AlphaLoss(torch.nn.Module):
     """ Custom loss as defined in the paper """
 
-    def __init__(self, mcts_flag=MCTS_FLAG):
-        self.mcts_flag = mcts_flag
+    def __init__(self):
         super(AlphaLoss, self).__init__()
+        self.log_softmax = nn.LogSoftmax()
 
+    # def forward(self, winner, self_play_winner, probas, self_play_probas):
+    #     value_error = F.mse_loss(winner, self_play_winner)
+    #     policy_error = torch.mean(torch.sum(-self_play_probas * self.log_softmax(probas), 1))
+    #     return value_error + policy_error
+
     def forward(self, winner, self_play_winner, probas, self_play_probas):
         value_error = F.mse_loss(winner, self_play_winner)
-        if not self.mcts_flag:
-            policy_error = F.binary_cross_entropy(probas, self_play_probas)
-        else:
-            policy_error = F.cross_entropy(probas, self_play_probas)
+        policy_error = F.kl_div(probas, self_play_probas)
         return value_error + policy_error
 
 
@@ -167,11 +170,11 @@ def new_agent(result):
     except KeyboardInterrupt:
         client.close()
         pool.terminate()
-
+
     example = {
         'state': Variable(state).type(DTYPE_FLOAT),
         'winner': Variable(winner).type(DTYPE_FLOAT),
-        'move' : Variable(move).type(DTYPE_FLOAT if not MCTS_FLAG else DTYPE_LONG)
+        'move' : Variable(move).type(DTYPE_FLOAT if MCTS_FLAG else DTYPE_LONG)
     }
     loss = train_epoch(new_player, optimizer, example, criterion)
 
```

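One caveat about the new policy term: PyTorch's `F.kl_div` expects its first argument to already be log-probabilities, so passing raw probabilities (or logits) computes a different quantity. Since the self-play target distribution is fixed, cross-entropy and KL differ only by the constant entropy of the target, which is why the commented-out log-softmax variant above works. A sketch of that variant made explicit, assuming `probas` are raw policy logits and `self_play_probas` are normalized visit counts, both of shape [batch, actions] (a sketch under those assumptions, not the author's final loss):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class AlphaLossSketch(nn.Module):
    """Sketch of the AlphaGo Zero loss: (z - v)^2 - pi^T log p."""

    def __init__(self):
        super().__init__()
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, winner, self_play_winner, probas, self_play_probas):
        value_error = F.mse_loss(winner, self_play_winner)
        # Cross-entropy against a soft target; equal to KL(target || pred)
        # up to the constant entropy of the fixed target distribution.
        policy_error = torch.mean(
            torch.sum(-self_play_probas * self.log_softmax(probas), dim=1))
        return value_error + policy_error
```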
lib/utils.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -1,5 +1,8 @@
 import os
 from models.agent import Player
+import numpy as np
+from const import *
+import random
 
 
 def get_ite(folder_path, ite):
@@ -62,3 +65,25 @@ def get_player(current_time, improvements):
     player = Player()
     player.load_models(path, models)
     return player, improvements + 1
+
+
+def sample_rotation(state, num=8):
+    dh_group = [(0, 0) (np.rot90, 1), (np.rot90, 2), (np.rot90, 3),
+                (np.fliplr, 0), (np.flipud, 0), (np.flipud, (np.rot90, 1)), (np.fliplr, (np.rot90, 1))]
+
+    dh_group = random.shuffle(dh_group)
+    states = []
+    for i in num:
+        print(i)
+        assert 0
+
+    return state
+
+
+if __name__ == "__main__":
+    pass
+
+
+
+
+
```

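The commit message says the dihedral rotation sampler is only started, and the version above is still placeholder code (missing comma in `dh_group`, `random.shuffle` returns None, `for i in num`, `assert 0`). A possible complete version, assuming `state` is a (planes, size, size) NumPy array and the transforms act on the two board axes (a sketch under those assumptions, not the author's final implementation):

```python
import random
import numpy as np

def sample_rotation(state, num=8):
    """Return `num` boards sampled without replacement from the 8 dihedral
    transforms of `state`: identity, 3 rotations, 2 flips, 2 flipped rotations.
    Assumes `state` has shape (planes, size, size); transforms act on axes (1, 2)."""
    transforms = [
        lambda x: x,
        lambda x: np.rot90(x, 1, axes=(1, 2)),
        lambda x: np.rot90(x, 2, axes=(1, 2)),
        lambda x: np.rot90(x, 3, axes=(1, 2)),
        lambda x: np.flip(x, axis=2),                            # left-right flip
        lambda x: np.flip(x, axis=1),                            # up-down flip
        lambda x: np.flip(np.rot90(x, 1, axes=(1, 2)), axis=1),  # diagonal reflection
        lambda x: np.flip(np.rot90(x, 1, axes=(1, 2)), axis=2),  # anti-diagonal reflection
    ]
    # random.shuffle works in place and returns None; sample instead.
    chosen = random.sample(transforms, num)
    return np.stack([t(state) for t in chosen])
```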
main.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -23,8 +23,8 @@ def main(folder, ite):
     try:
         x = pool.apply_async(self_play, args=(current_time, ite,))
         y = pool.apply_async(train, args=(current_time, ite,))
-        x.get()
-        # y.get()
+        # x.get()
+        y.get()
     except KeyboardInterrupt:
         pool.terminate()
     else:
```

models/mcts.py

Lines changed: 24 additions & 3 deletions
```diff
@@ -8,6 +8,9 @@ def __init__(self, move, probas):
         self.n = 0
         self.w = 0
         self.q = 0
+
+    def expand(self):
+        pass
 
 
 class MCTS():
@@ -19,6 +22,24 @@ def __init__(self, c_puct, extractor, value_net, policy_net):
         self.c_puct = c_puct
 
 
+    def _draw_move(self, action_scores, competitive=False):
+        """
+        Find the best move, either deterministically for competitive play
+        or stochiasticly according to some temperature constant
+        """
+
+        if competitive:
+            move = np.argmax(action_scores)
+
+        else:
+            action_scores = np.power(action_scores, (1. / TEMP))
+            total = np.sum(action_scores)
+            probas = action_scores / total
+            move = np.random.choice(action_scores.shape[0], p=probas)
+
+        return move
+
+
     def _puct(self, proba, total_count, count):
         """
         Function of P and N that increases if an action hasn't been explored
@@ -30,7 +51,7 @@ def _puct(self, proba, total_count, count):
         return action_score
 
 
-    def select(self, nodes):
+    def _select(self, nodes):
         """
         Select the move that maximises the mean value of the next state +
         the result of the PUCT function
@@ -45,8 +66,8 @@ def select(self, nodes):
 
         return max(action_scores)
 
-    def search(self, game):
-        x = random.choice(actions)
+
+    def search(self, game, competitive=False):
         return x
 
 
```

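`search()` is still a stub (it returns an undefined `x`) and `expand()` is empty. For reference, a minimal sketch of how the selection / expansion / backup loop usually fits together with the pieces above (PUCT selection, temperature-based move drawing over visit counts); the node interface and function names are assumptions, not the author's design, and terminal-state handling and root Dirichlet noise are omitted:

```python
import numpy as np

def mcts_search(root, evaluate, n_simulations=160, c_puct=1.0,
                competitive=False, temp=1.0):
    """Hypothetical search loop over nodes carrying p (prior), n, w, q,
    children, and a cached board state. `evaluate(state)` is assumed to
    return (prior_probas, value) from the nets, with value given from the
    perspective of the player to move at that state."""
    for _ in range(n_simulations):
        node, path = root, [root]
        # Selection: descend with PUCT, Q + c_puct * P * sqrt(sum_b N_b) / (1 + N).
        while node.children:
            total = sum(child.n for child in node.children)
            node = max(node.children,
                       key=lambda c: c.q + c_puct * c.p * np.sqrt(total) / (1 + c.n))
            path.append(node)
        # Expansion + evaluation of the leaf with the networks.
        probas, value = evaluate(node.state)
        node.expand(probas)
        # Backup: each node stores the value for the player who moved into it,
        # so flip the sign at every ply on the way back up (root has no action).
        for visited in reversed(path[1:]):
            value = -value
            visited.n += 1
            visited.w += value
            visited.q = visited.w / visited.n
    # Play proportionally to N ** (1 / temp), or greedily when competitive.
    visits = np.array([child.n for child in root.children], dtype=np.float64)
    if competitive:
        return int(np.argmax(visits))
    pi = visits ** (1.0 / temp)
    pi /= pi.sum()
    return int(np.random.choice(len(visits), p=pi))
```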