
Commit a559d94

docs and such
1 parent 1eb6870 commit a559d94

3 files changed (+187, -29 lines)


test/test_nn.py

Lines changed: 25 additions & 24 deletions
@@ -620,55 +620,56 @@ def test_MaxUnpool2d_output_size(self):
 
     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
     def test_RNN_cpu_vs_cudnn(self):
-
-        def forward_backward(cuda, mode, bias, input_val, hx_val, weights_val):
-            rnn = nn.RNNBase(mode, input_size, hidden_size, num_layers, bias=bias)
+
+        def forward_backward(cuda, module, bias, input_val, hx_val, weights_val):
+            rnn = module(input_size, hidden_size, num_layers, bias=bias)
+            is_lstm = module == nn.LSTM
 
             for x_layer, y_layer in zip(rnn.all_weights, weights_val):
                 for x, y in zip(x_layer, y_layer):
                     x.data.copy_(y.data)
 
             input = Variable(input_val.clone(), requires_grad=True)
-            if mode == 'LSTM':
+            if is_lstm:
                 hx = (Variable(hx_val.clone(), requires_grad=True),
                       Variable(hx_val.add(1), requires_grad=True))
             else:
                 hx = Variable(hx_val.clone(), requires_grad=True)
-
+
             if cuda:
                 rnn.cuda()
                 input.data = input.data.cuda()
-                if mode == 'LSTM':
+                if is_lstm:
                     hx[0].data = hx[0].data.cuda()
                     hx[1].data = hx[1].data.cuda()
                 else:
                     hx.data = hx.data.cuda()
 
             output, hy = rnn(input, hx)
             # FIXME this is because of a pytorch bug
-            if mode == 'LSTM':
+            if is_lstm:
                 fake_loss = 0*(hy[0] + hy[1]).sum()
             else:
                 fake_loss = 0*hy.sum()
-
+
             loss = output.sum() + fake_loss
             loss.backward()
-
+
             return {'output': output.data,
-                    'hy': hy[0].data if mode == 'LSTM' else hy.data,
+                    'hy': hy[0].data if is_lstm else hy.data,
                     'weights': rnn.all_weights,
                     'grad_input': input.grad,
-                    'grad_hx': hx[0].grad if mode == 'LSTM' else hx.grad,
-                    'cy': hy[1].data if mode == 'LSTM' else None,
-                    'grad_cx': hx[1].grad if mode == 'LSTM' else None}
-
+                    'grad_hx': hx[0].grad if is_lstm else hx.grad,
+                    'cy': hy[1].data if is_lstm else None,
+                    'grad_cx': hx[1].grad if is_lstm else None}
+
         def diff(t_cpu, t_gpu, name):
             self.assertTrue(torch.is_tensor(t_cpu))
             self.assertTrue(torch.is_tensor(t_gpu))
             delta = t_gpu.cpu().add(-1, t_cpu).abs().max()
             # print("{:30s} cpu: {:10g} gpu: {:10g} diff: {:10g}".format(name, t_cpu.abs().max(), t_gpu.abs().max(), delta))
             self.assertLess(delta, 2 * PRECISION)
-
+
         input_size = 10
         hidden_size = 20
         num_layers = 2
@@ -677,27 +678,27 @@ def diff(t_cpu, t_gpu, name):
 
         # FIXME: we can't use torch.cuda.DoubleTensor because sum() is not yet defined on it
         with set_default_tensor_type('torch.FloatTensor'):
-            for mode in ("RNN_RELU", "RNN_TANH", "GRU", "LSTM"):
+            for module in (nn.RNN, nn.RNNReLU, nn.LSTM, nn.GRU):
                 for bias in (True, False):
                     input_val = torch.randn(seq_length, batch, input_size)
                     hx_val = torch.randn(num_layers, batch, hidden_size)
-
-                    weights_val = nn.RNNBase(mode, input_size, hidden_size, num_layers).all_weights
-
-                    outputs_cpu = forward_backward(False, mode, bias, input_val, hx_val, weights_val)
-                    outputs_gpu = forward_backward(True, mode, bias, input_val, hx_val, weights_val)
-
+
+                    weights_val = module(input_size, hidden_size, num_layers).all_weights
+
+                    outputs_cpu = forward_backward(False, module, bias, input_val, hx_val, weights_val)
+                    outputs_gpu = forward_backward(True, module, bias, input_val, hx_val, weights_val)
+
                     diff(outputs_cpu['output'], outputs_gpu['output'], 'output')
                     diff(outputs_cpu['hy'], outputs_gpu['hy'], 'hy')
                     diff(outputs_cpu['grad_input'], outputs_gpu['grad_input'], 'grad_input')
                     diff(outputs_cpu['grad_hx'], outputs_gpu['grad_hx'], 'grad_hx')
                     if outputs_cpu['cy'] is not None:
                         diff(outputs_cpu['cy'], outputs_gpu['cy'], 'cy')
                         diff(outputs_cpu['grad_cx'], outputs_gpu['grad_cx'], 'grad_cx')
-
+
                     for i, (cpu_layer_weight, gpu_layer_weight) in enumerate(zip(outputs_cpu['weights'], outputs_gpu['weights'])):
                         for j, (cpu_weight, gpu_weight) in enumerate(zip(cpu_layer_weight, gpu_layer_weight)):
-                            diff(cpu_weight.grad, gpu_weight.grad, mode + ' grad_weight[{},{}]'.format(i, j))
+                            diff(cpu_weight.grad, gpu_weight.grad, 'grad_weight[{},{}]'.format(i, j))
 
 
 def add_test(test):
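The key change in this test is that forward_backward now receives the module class itself (nn.RNN, nn.RNNReLU, nn.LSTM, nn.GRU) instead of a cuDNN mode string, and derives LSTM-specific handling from is_lstm = module == nn.LSTM. A minimal standalone sketch of that parameterization pattern, using the same APIs as the test (sizes here are hypothetical, chosen only for illustration):

```
import torch
import torch.nn as nn
from torch.autograd import Variable

# Hypothetical sizes for illustration only.
input_size, hidden_size, num_layers = 10, 20, 2
seq_length, batch = 7, 6

# Iterate over the module classes directly instead of mode strings;
# the class itself tells us whether the hidden state is an (h, c) tuple.
for module in (nn.RNN, nn.RNNReLU, nn.LSTM, nn.GRU):
    rnn = module(input_size, hidden_size, num_layers)   # no mode string needed
    is_lstm = module == nn.LSTM

    input = Variable(torch.randn(seq_length, batch, input_size))
    h0 = Variable(torch.randn(num_layers, batch, hidden_size))
    # LSTM is the only class whose hidden state is an (h, c) tuple.
    hx = (h0, Variable(torch.randn(num_layers, batch, hidden_size))) if is_lstm else h0

    output, hy = rnn(input, hx)
```

The real test additionally copies a shared weights_val into both the CPU and GPU instances so the two runs start from identical parameters before comparing outputs and gradients.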

torch/nn/functions/rnn.py

Lines changed: 3 additions & 3 deletions
@@ -77,9 +77,9 @@ def GRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
     # this is a bit weird, it doesn't match the order of parameters
     # implied by the cudnn docs, and it also uses nexth for output...
     resetgate = sigmoid(gi[:,0*hsz:1*hsz] + gh[:,0*hsz:1*hsz])
-    updategate = sigmoid(gi[:,1*hsz:2*hsz] + gh[:,1*hsz:2*hsz])
-    output = tanh(gi[:,2*hsz:3*hsz] + resetgate * gh[:,2*hsz:3*hsz])
-    nexth = output + updategate * (hidden - output)
+    inputgate = sigmoid(gi[:,1*hsz:2*hsz] + gh[:,1*hsz:2*hsz])
+    newgate = tanh(gi[:,2*hsz:3*hsz] + resetgate * gh[:,2*hsz:3*hsz])
+    nexth = newgate + inputgate * (hidden - newgate)
 
     return nexth, nexth  # FIXME: nexth, nexth ???
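The rename maps the GRUCell variables onto the conventional GRU gate names (reset, input/update, new), matching the GRU docstring added below. A standalone NumPy sketch of the same cell update, not the library code (the function name and the row-stacked weight layout are illustrative assumptions), shows that nexth = newgate + inputgate * (hidden - newgate) is the usual (1 - z) * n + z * h update:

```
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_cell(x, h, w_ih, w_hh, b_ih, b_hh):
    """One GRU step; w_ih/w_hh stack the reset, input, new gates along rows (3*hsz)."""
    hsz = h.shape[1]
    gi = x @ w_ih.T + b_ih   # input-to-hidden contribution, shape (batch, 3*hsz)
    gh = h @ w_hh.T + b_hh   # hidden-to-hidden contribution, shape (batch, 3*hsz)

    resetgate = sigmoid(gi[:, 0*hsz:1*hsz] + gh[:, 0*hsz:1*hsz])
    inputgate = sigmoid(gi[:, 1*hsz:2*hsz] + gh[:, 1*hsz:2*hsz])
    newgate = np.tanh(gi[:, 2*hsz:3*hsz] + resetgate * gh[:, 2*hsz:3*hsz])

    # Algebraically identical to (1 - inputgate) * newgate + inputgate * h.
    return newgate + inputgate * (h - newgate)
```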

torch/nn/modules/rnn.py

Lines changed: 159 additions & 2 deletions
@@ -7,7 +7,6 @@
 
 
 class RNNBase(Module):
-    # FIXME: docstring
 
     def __init__(self, mode, input_size, hidden_size,
                  num_layers=1, bias=True, batch_first=False, dropout=0):
@@ -22,7 +21,6 @@ def __init__(self, mode, input_size, hidden_size,
         self.all_weights = []
         super_weights = {}
         for layer in range(num_layers):
-            # FIXME: sizes are different for LSTM/GRU
             layer_input_size = input_size if layer == 0 else hidden_size
             if mode == 'LSTM':
                 gate_size = 4 * hidden_size
@@ -73,17 +71,176 @@ def forward(self, input, hx):
 
 
 class RNN(RNNBase):
+    """Applies a multi-layer RNN with tanh non-linearity to an input sequence.
+
+    For each element in the input sequence, each layer computes the following
+    function:
+    ```
+    h_t = tanh(w_ih * x_t + b_ih + w_hh * h_(t-1) + b_hh)
+    ```
+    where `h_t` is the hidden state at time t, and `x_t` is the hidden
+    state of the previous layer at time t, or `input_t` for the first layer.
+
+    Args:
+        input_size: The number of expected features in the input x
+        hidden_size: The number of features in the hidden state h
+        num_layers: The number of recurrent layers.
+        bias: If False, then the layer does not use bias weights b_ih and b_hh (default=True).
+        batch_first: If True, then the input tensor is provided as (batch, seq, feature).
+        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer.
+    Input: input, h_0
+        input: A (seq_len x batch x input_size) tensor containing the features of the input sequence.
+        h_0: A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
+    Output: output, h_n
+        output: A (seq_len x batch x hidden_size) tensor containing the output features (h_k) from the last layer of the RNN, for each k.
+        h_n: A (num_layers x batch x hidden_size) tensor containing the hidden state for k=seq_len.
+    Members:
+        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, of shape (input_size x hidden_size)
+        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, of shape (hidden_size x hidden_size)
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, of shape (hidden_size)
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, of shape (hidden_size)
+    Examples:
+        >>> rnn = nn.RNN(10, 20, 2)
+        >>> input = Variable(torch.randn(5, 3, 10))
+        >>> h0 = Variable(torch.randn(2, 3, 20))
+        >>> output, hn = rnn(input, h0)
+    """
+
     def __init__(self, *args, **kwargs):
         super(RNN, self).__init__('RNN_TANH', *args, **kwargs)
 
 class RNNReLU(RNNBase):
+    """Applies a multi-layer RNN with ReLU non-linearity to an input sequence.
+
+    For each element in the input sequence, each layer computes the following
+    function:
+    ```
+    h_t = ReLU(w_ih * x_t + b_ih + w_hh * h_(t-1) + b_hh)
+    ```
+    where `h_t` is the hidden state at time t, and `x_t` is the hidden
+    state of the previous layer at time t, or `input_t` for the first layer.
+
+    Args:
+        input_size: The number of expected features in the input x
+        hidden_size: The number of features in the hidden state h
+        num_layers: The number of recurrent layers.
+        bias: If False, then the layer does not use bias weights b_ih and b_hh (default=True).
+        batch_first: If True, then the input tensor is provided as (batch, seq, feature).
+        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer.
+    Input: input, h_0
+        input: A (seq_len x batch x input_size) tensor containing the features of the input sequence.
+        h_0: A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
+    Output: output, h_n
+        output: A (seq_len x batch x hidden_size) tensor containing the output features (h_k) from the last layer of the RNN, for each k.
+        h_n: A (num_layers x batch x hidden_size) tensor containing the hidden state for k=seq_len.
+    Members:
+        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, of shape (input_size x hidden_size)
+        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, of shape (hidden_size x hidden_size)
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, of shape (hidden_size)
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, of shape (hidden_size)
+    Examples:
+        >>> rnn = nn.RNNReLU(10, 20, 2)
+        >>> input = Variable(torch.randn(5, 3, 10))
+        >>> h0 = Variable(torch.randn(2, 3, 20))
+        >>> output, hn = rnn(input, h0)
+    """
+
     def __init__(self, *args, **kwargs):
         super(RNNReLU, self).__init__('RNN_RELU', *args, **kwargs)
 
 class LSTM(RNNBase):
+    """Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
+
+    For each element in the input sequence, each layer computes the following
+    function:
+    ```
+    i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
+    f_t = sigmoid(W_if x_t + b_if + W_hf h_(t-1) + b_hf)
+    g_t = tanh(W_ig x_t + b_ig + W_hg h_(t-1) + b_hg)
+    o_t = sigmoid(W_io x_t + b_io + W_ho h_(t-1) + b_ho)
+    c_t = f_t * c_(t-1) + i_t * g_t
+    h_t = o_t * tanh(c_t)
+    ```
+    where `h_t` is the hidden state at time t, `c_t` is the cell state at time t,
+    `x_t` is the hidden state of the previous layer at time t (or input_t for the first layer),
+    and `i_t`, `f_t`, `g_t`, `o_t` are the input, forget, cell, and output gates, respectively.
+
+    Args:
+        input_size: The number of expected features in the input x
+        hidden_size: The number of features in the hidden state h
+        num_layers: The number of recurrent layers.
+        bias: If False, then the layer does not use bias weights b_ih and b_hh (default=True).
+        batch_first: If True, then the input tensor is provided as (batch, seq, feature).
+        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer.
+    Input: input, (h_0, c_0)
+        input: A (seq_len x batch x input_size) tensor containing the features of the input sequence.
+        h_0: A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
+        c_0: A (num_layers x batch x hidden_size) tensor containing the initial cell state for each element in the batch.
+    Output: output, (h_n, c_n)
+        output: A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t.
+        h_n: A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len.
+        c_n: A (num_layers x batch x hidden_size) tensor containing the cell state for t=seq_len.
+    Members:
+        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer (W_ii|W_if|W_ig|W_io), of shape (input_size x 4*hidden_size)
+        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer (W_hi|W_hf|W_hg|W_ho), of shape (hidden_size x 4*hidden_size)
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer (b_ii|b_if|b_ig|b_io), of shape (4*hidden_size)
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer (b_hi|b_hf|b_hg|b_ho), of shape (4*hidden_size)
+    Examples:
+        >>> rnn = nn.LSTM(10, 20, 2)
+        >>> input = Variable(torch.randn(5, 3, 10))
+        >>> h0 = Variable(torch.randn(2, 3, 20))
+        >>> c0 = Variable(torch.randn(2, 3, 20))
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+    """
+
     def __init__(self, *args, **kwargs):
         super(LSTM, self).__init__('LSTM', *args, **kwargs)
 
 class GRU(RNNBase):
+    """Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
+
+    For each element in the input sequence, each layer computes the following
+    function:
+    ```
+    r_t = sigmoid(W_ir x_t + b_ir + W_hr h_(t-1) + b_hr)
+    i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
+    n_t = tanh(W_in x_t + b_in + r_t * (W_hn h_(t-1) + b_hn))
+    h_t = (1 - i_t) * n_t + i_t * h_(t-1)
+    ```
+    where `h_t` is the hidden state at time t, `x_t` is the hidden
+    state of the previous layer at time t (or input_t for the first layer),
+    and `r_t`, `i_t`, `n_t` are the reset, input, and new gates, respectively.
+
+    Args:
+        input_size: The number of expected features in the input x
+        hidden_size: The number of features in the hidden state h
+        num_layers: The number of recurrent layers.
+        bias: If False, then the layer does not use bias weights b_ih and b_hh (default=True).
+        batch_first: If True, then the input tensor is provided as (batch, seq, feature).
+        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer.
+    Input: input, h_0
+        input: A (seq_len x batch x input_size) tensor containing the features of the input sequence.
+        h_0: A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
+    Output: output, h_n
+        output: A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t.
+        h_n: A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len.
+    Members:
+        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer (W_ir|W_ii|W_in), of shape (input_size x 3*hidden_size)
+        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer (W_hr|W_hi|W_hn), of shape (hidden_size x 3*hidden_size)
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer (b_ir|b_ii|b_in), of shape (3*hidden_size)
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer (b_hr|b_hi|b_hn), of shape (3*hidden_size)
+    Examples:
+        >>> rnn = nn.GRU(10, 20, 2)
+        >>> input = Variable(torch.randn(5, 3, 10))
+        >>> h0 = Variable(torch.randn(2, 3, 20))
+        >>> output, hn = rnn(input, h0)
+    """
+
     def __init__(self, *args, **kwargs):
         super(GRU, self).__init__('GRU', *args, **kwargs)
+
+
+# FIXME: add module wrappers around XXXCell, and maybe StackedRNN and Recurrent
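For readers cross-checking the LSTM docstring above, its equations describe a single cell step. A minimal NumPy sketch of that step (illustrative only: the function name and the i, f, g, o gate ordering are assumptions; the 4 * hidden_size gate width comes from the RNNBase hunk above):

```
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_cell(x, h, c, w_ih, w_hh, b_ih, b_hh):
    """One LSTM step; w_ih/w_hh stack the i, f, g, o gates along rows (4*hsz)."""
    hsz = h.shape[1]
    gates = x @ w_ih.T + b_ih + h @ w_hh.T + b_hh   # shape (batch, 4*hsz)

    i = sigmoid(gates[:, 0*hsz:1*hsz])   # input gate
    f = sigmoid(gates[:, 1*hsz:2*hsz])   # forget gate
    g = np.tanh(gates[:, 2*hsz:3*hsz])   # candidate cell state
    o = sigmoid(gates[:, 3*hsz:4*hsz])   # output gate

    c_next = f * c + i * g               # c_t = f_t * c_(t-1) + i_t * g_t
    h_next = o * np.tanh(c_next)         # h_t = o_t * tanh(c_t)
    return h_next, c_next
```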
