 import torch.optim as optim
 from torch.autograd import Variable
 import math
+from utils import preprocess_gradients
+from layer_norm_lstm import LayerNormLSTMCell
+from layer_norm import LayerNorm1D

 class MetaOptimizer(nn.Module):

@@ -16,16 +19,12 @@ def __init__(self, model, num_layers, hidden_size):

         self.hidden_size = hidden_size

-        self.linear1 = nn.Linear(2, hidden_size)
+        self.linear1 = nn.Linear(3, hidden_size)
+        self.ln1 = LayerNorm1D(hidden_size)

         self.lstms = []
         for i in range(num_layers):
-            self.lstms.append(nn.LSTMCell(hidden_size, hidden_size))
-
-            self.lstms[-1].bias_ih.data.fill_(0)
-            self.lstms[-1].bias_hh.data.fill_(0)
-            self.lstms[-1].bias_hh.data[10:20].fill_(1)
-
+            self.lstms.append(LayerNormLSTMCell(hidden_size, hidden_size))

         self.linear2 = nn.Linear(hidden_size, 1)
         self.linear2.weight.data.mul_(0.1)
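
`LayerNorm1D` is imported from `layer_norm` but not defined in this diff. As a rough sketch only (the real module may differ), a 1-D layer normalization over the feature dimension with learnable gain and bias could look like:

```python
import torch
import torch.nn as nn


class LayerNorm1D(nn.Module):
    """Sketch: layer norm over the last (feature) dimension of a 2-D input."""

    def __init__(self, num_features, eps=1e-5):
        super(LayerNorm1D, self).__init__()
        self.eps = eps
        # Learnable per-feature gain and bias, initialized to the identity map.
        self.weight = nn.Parameter(torch.ones(1, num_features))
        self.bias = nn.Parameter(torch.zeros(1, num_features))

    def forward(self, x):
        # Normalize each row to zero mean / unit std, then rescale and shift.
        mean = x.mean(1, keepdim=True)
        std = x.std(1, keepdim=True)
        return (x - mean) / (std + self.eps) * self.weight + self.bias
```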
@@ -53,20 +52,9 @@ def reset_lstm(self, keep_states=False, model=None, use_cuda=False):
             if use_cuda:
                 self.hx[i], self.cx[i] = self.hx[i].cuda(), self.cx[i].cuda()

-    def forward(self, inputs):
-        initial_size = inputs.size()
-        x = inputs.view(-1, 1)
-
+    def forward(self, x):
         # Gradients preprocessing
-        p = 10
-        eps = 1e-6
-        indicator = (x.abs() > math.exp(-p)).float()
-        x1 = (x.abs() + eps).log() / p * indicator - (1 - indicator)
-        x2 = x.sign() * indicator + math.exp(p) * x * (1 - indicator)
-
-        x = torch.cat((x1, x2), 1)
-
-        x = F.tanh(self.linear1(x))
+        x = F.tanh(self.ln1(self.linear1(x)))

         for i in range(len(self.lstms)):
             if x.size(0) != self.hx[i].size(0):
@@ -77,8 +65,7 @@ def forward(self, inputs):
             x = self.hx[i]

         x = self.linear2(x)
-        x = x.view(*initial_size)
-        return x
+        return x.squeeze()

     def meta_update(self, model_with_grads):
         # First we need to create a flat version of parameters and gradients
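
The log/sign gradient preprocessing deleted from `forward` above (p = 10, eps = 1e-6) is presumably what `utils.preprocess_gradients` now provides. A sketch that simply reuses the deleted lines, with the reshape from the old `inputs.view(-1, 1)` folded in (the actual helper in `utils` may differ):

```python
import math

import torch


def preprocess_gradients(x):
    # Log/sign preprocessing of a flat 1-D tensor of gradients; each scalar
    # gradient is mapped to two features, as in the code removed from forward.
    p = 10
    eps = 1e-6
    x = x.view(-1, 1)
    indicator = (x.abs() > math.exp(-p)).float()
    x1 = (x.abs() + eps).log() / p * indicator - (1 - indicator)
    x2 = x.sign() * indicator + math.exp(p) * x * (1 - indicator)
    return torch.cat((x1, x2), 1)
```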
@@ -89,10 +76,12 @@ def meta_update(self, model_with_grads):
             grads.append(module._parameters['bias'].grad.data.view(-1))

         flat_params = self.meta_model.get_flat_params()
-        flat_grads = Variable(torch.cat(grads))
+        flat_grads = preprocess_gradients(torch.cat(grads))
+
+        inputs = Variable(torch.cat((flat_grads, flat_params.data), 1))

         # Meta update itself
-        flat_params = flat_params + self(flat_grads)
+        flat_params = flat_params + self(inputs)

         self.meta_model.set_flat_params(flat_params)

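`LayerNormLSTMCell` is also defined outside this diff. A minimal, assumed sketch of an LSTM cell with layer-normalized gate pre-activations that keeps the `(h, c)` interface of `nn.LSTMCell` (reusing the `LayerNorm1D` sketch above):

```python
import torch
import torch.nn as nn


class LayerNormLSTMCell(nn.Module):
    """Sketch: LSTM cell that layer-normalizes its gate pre-activations."""

    def __init__(self, input_size, hidden_size):
        super(LayerNormLSTMCell, self).__init__()
        self.hidden_size = hidden_size
        # Input-to-hidden and hidden-to-hidden projections for all four gates.
        self.fc_ih = nn.Linear(input_size, 4 * hidden_size)
        self.fc_hh = nn.Linear(hidden_size, 4 * hidden_size)
        self.ln_ih = LayerNorm1D(4 * hidden_size)
        self.ln_hh = LayerNorm1D(4 * hidden_size)
        self.ln_cell = LayerNorm1D(hidden_size)

    def forward(self, x, state):
        hx, cx = state
        # Normalize each projection separately, then combine and split gates.
        gates = self.ln_ih(self.fc_ih(x)) + self.ln_hh(self.fc_hh(hx))
        i, f, g, o = gates.chunk(4, dim=1)
        i, f, o = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o)
        g = torch.tanh(g)
        cy = f * cx + i * g
        hy = o * torch.tanh(self.ln_cell(cy))
        return hy, cy
```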