Commit b1ae7f9

csarofeen authored and apaszke committed
Added functionality for data parallel table (pytorch#843)
1 parent 8b61ee5 commit b1ae7f9

File tree: 6 files changed, +155 -50 lines changed

test/test_nn.py

Lines changed: 63 additions & 19 deletions
@@ -259,7 +259,8 @@ def bw_hook(inc, h_module, grad_input, grad_output):
         self.assertEqual(counter['forwards'], 2)
         self.assertEqual(counter['backwards'], 0)

-        test_bwd = module.register_backward_hook(lambda *args: bw_hook(1, *args))
+        test_bwd = module.register_backward_hook(
+            lambda *args: bw_hook(1, *args))

         output = module(input)
         self.assertEqual(counter['forwards'], 3)
@@ -816,7 +817,8 @@ def test_parallel_apply(self):
         inputs = ((i1,), (i2,))
         modules = (l1, l2)
         expected_outputs = (expected1, expected2)
-        outputs = dp.parallel_apply(modules, inputs)
+
+        outputs = dp.parallel_apply(modules, inputs, None)
         for out, expected in zip(outputs, expected_outputs):
             self.assertEqual(out.data, expected)

@@ -833,27 +835,67 @@ def test_data_parallel_noop(self):
     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
     def test_data_parallel_multiple_input(self):
         class TestModule(nn.Module):
-            def forward(self, x, y):
-                return x + y
-
-        m = TestModule()
-        x = Variable(torch.randn(5, 5).float())
-        y = Variable(torch.randn(5, 5).float())
-        expected = m(x, y)

-        out = dp.data_parallel(m, (x, y), (0, 1))
-        self.assertEqual(out, expected)
+            def forward(self, var1, var2, float1, var3=None):
+                if var3 is None:
+                    return float1 * (var1 * var2)
+                else:
+                    return float1 * (var1 * var2 + var3)

-        out = dp.data_parallel(m, (x, y), (0,))
-        self.assertEqual(out, expected)
+        m = TestModule()
+        var1 = Variable(torch.randn(5, 5).float(), requires_grad=True)
+        var2 = Variable(torch.randn(5, 5).float(), requires_grad=True)
+        var3 = Variable(torch.randn(5, 5).float(), requires_grad=False)
+
+        float1 = torch.randn(1)[0]
+        target = Variable(torch.randn(5, 5).float()).cuda()
+        crit = nn.MSELoss()
+
+        expected = m(var1, var2, float1)
+        loss = expected.sum()
+        loss.backward()
+        gvar1_exp = var1.grad.clone()
+        gvar2_exp = var2.grad.clone()
+
+        def local_test(out):
+            var1.grad.data.fill_(0.0)
+            var2.grad.data.fill_(0.0)
+            loss = out.sum()
+            loss.backward()
+            self.assertEqual(out, expected)
+            self.assertEqual(gvar1_exp, var1.grad)
+            self.assertEqual(gvar2_exp, var2.grad)
+
+        out = dp.data_parallel(m, (var1, var2, float1), (0, 1))
+        local_test(out)
+
+        out = dp.data_parallel(m, (var1, var2, float1), (0,))
+        local_test(out)
+
+        var1.grad.data.fill_(0.0)
+        var2.grad.data.fill_(0.0)
+        expected = m(var1, var2, float1, var3=var3)
+        loss = expected.sum()
+        loss.backward()
+        gvar1_exp = var1.grad.clone()
+        gvar2_exp = var2.grad.clone()

         dpm = nn.DataParallel(TestModule())
-        out = dpm(x, y)
-        self.assertEqual(out, expected)
+        out = dpm(var1, var2, float1, var3=var3)
+        local_test(out)

         dpm = nn.DataParallel(TestModule(), device_ids=[0])
-        out = dpm(x, y)
-        self.assertEqual(out, expected)
+        out = dpm(var1, var2, float1, var3=var3)
+        local_test(out)
+
+        kwarg_wrap = {'var3': var3}
+        out = dp.data_parallel(
+            m, (var1, var2, float1), (0, 1), module_kwargs=kwarg_wrap)
+        local_test(out)
+
+        out = dp.data_parallel(
+            m, (var1, var2, float1), (0,), module_kwargs=kwarg_wrap)
+        local_test(out)

     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
     def test_data_parallel_small_back(self):
@@ -1426,8 +1468,10 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu):
         for nonlinearity in ('tanh', 'relu'):
             hx_val = torch.randn(num_layers, batch, hidden_size)
             input_val = torch.randn(seq_length, batch, input_size)
-            grad_output = torch.randn(seq_length, batch, hidden_size * num_directions)
-            grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size)
+            grad_output = torch.randn(
+                seq_length, batch, hidden_size * num_directions)
+            grad_hy = torch.randn(
+                num_layers * num_directions, batch, hidden_size)

             rnn = nn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity)
             outputs_cpu = forward_backward(False, rnn, input_val, hx_val, grad_output, grad_hy, rnn.all_weights)

torch/cuda/comm.py

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ def scatter(tensor, devices, chunk_sizes=None, dim=0):
     assert min(chunk_sizes) > 0, "got a negative chunk_size"
     chunks = [tensor.narrow(dim, start - size, size)
               for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
+    chunks = tuple(chunk.contiguous() for chunk in chunks)
     # TODO: copy to a pinned buffer first (if copying from CPU)
     return tuple(chunk.cuda(gpu_id, async=chunk.is_contiguous())
                  for gpu_id, chunk in zip(devices, chunks))
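The single added line matters because tensor.narrow along any dimension other than the first returns a strided, non-contiguous view, and the copy on the next line only takes the async path when chunk.is_contiguous() is true. A minimal CPU-only sketch of that behaviour (the shapes are illustrative, not taken from the commit):

import torch

t = torch.randn(4, 6)

chunk_dim0 = t.narrow(0, 0, 2)   # rows 0-1: still a contiguous block
chunk_dim1 = t.narrow(1, 0, 3)   # columns 0-2: a strided, non-contiguous view

assert chunk_dim0.is_contiguous()
assert not chunk_dim1.is_contiguous()
assert chunk_dim1.contiguous().is_contiguous()   # .contiguous() copies into a dense block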

torch/nn/parallel/data_parallel.py

Lines changed: 57 additions & 14 deletions
@@ -7,6 +7,7 @@


 class DataParallel(Module):
+
     """Implements data parallelism at the module level.

     This container parallelizes the application of the given module by
@@ -21,6 +22,12 @@ class DataParallel(Module):

     See also: :ref:`cuda-nn-dataparallel-instead`

+    Arbitrary positional and keyword inputs are allowed to be passed into
+    DataParallel EXCEPT Tensors. All variables will be scattered on dim
+    specified (default 0). Primitive types will be broadcasted, but all
+    other types will be a shallow copy and can be corrupted if written to in
+    the model's forward pass.
+
     Args:
         module: module to be parallelized
         device_ids: CUDA devices (default: all devices)
@@ -36,48 +43,70 @@ class DataParallel(Module):
     """

     # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
-    def __init__(self, module, device_ids=None, output_device=None):
+
+    def __init__(self, module, device_ids=None, output_device=None, dim=0):
         super(DataParallel, self).__init__()
         if device_ids is None:
             device_ids = list(range(torch.cuda.device_count()))
         if output_device is None:
             output_device = device_ids[0]
+        self.dim = dim
         self.module = module
         self.device_ids = device_ids
         self.output_device = output_device
         if len(self.device_ids) == 1:
             self.module.cuda(device_ids[0])

-    def forward(self, *inputs):
+    def forward(self, *inputs, **kwargs):
         def _to_cuda(obj):
             if isinstance(obj, Variable):
                 return obj.cuda()
-            return tuple((map(_to_cuda, obj)))
+            if isinstance(obj, tuple) or isinstance(obj, list):
+                return type(obj)((map(_to_cuda, obj)))
+            return obj

         if len(self.device_ids) == 1:
             with torch.cuda.device(self.device_ids[0]):
                 inputs_cuda = _to_cuda(inputs)
-            return self.module(*inputs_cuda)
+            if kwargs:
+                gpu_dict = {}
+                for key in kwargs.keys():
+                    gpu_dict[key] = _to_cuda(kwargs[key])
+                return self.module(*inputs_cuda, **gpu_dict)
+            else:
+                return self.module(*inputs_cuda)
+
         replicas = self.replicate(self.module, self.device_ids)
         scattered = self.scatter(inputs, self.device_ids)
+
+        gpu_dicts = None
+        if kwargs:
+            scatter_kwargs = {}
+            for key in kwargs.keys():
+                scatter_kwargs[key] = self.scatter(
+                    _to_cuda(kwargs[key]), self.device_ids)
+            gpu_dicts = tuple(
+                {key: values[i] for key, values in scatter_kwargs.items()}
+                for i in self.device_ids
+            )
         replicas = replicas[:len(scattered)]
-        outputs = self.parallel_apply(replicas, scattered)
+        outputs = self.parallel_apply(replicas, scattered, gpu_dicts)
         return self.gather(outputs, self.output_device)

     def replicate(self, module, device_ids):
         return replicate(module, device_ids)

     def scatter(self, input, device_ids):
-        return scatter(input, device_ids)
+        return scatter(input, device_ids, dim=self.dim)

-    def parallel_apply(self, replicas, inputs):
-        return parallel_apply(replicas, inputs)
+    def parallel_apply(self, replicas, inputs, kwargs):
+        return parallel_apply(replicas, inputs, kwargs)

     def gather(self, outputs, output_device):
-        return gather(outputs, output_device)
+        return gather(outputs, output_device, dim=self.dim)


-def data_parallel(module, inputs, device_ids, output_device=None):
+def data_parallel(module, inputs, device_ids, output_device=None, dim=0, module_kwargs=None):
     """Evaluates module(input) in parallel across the GPUs given in device_ids.

     This is the functional version of the DataParallel module.
@@ -96,13 +125,27 @@ def data_parallel(module, inputs, device_ids, output_device=None):
         inputs = (inputs,)

     if not device_ids:
-        return module(*inputs)
+        if module_kwargs is None:
+            return module(*inputs)
+        else:
+            return module(*inputs, **module_kwargs)

     if output_device is None:
         output_device = device_ids[0]

     replicas = replicate(module, device_ids)
-    scattered = scatter(inputs, device_ids)
+    scattered = scatter(inputs, device_ids, dim)
+
+    gpu_dicts = None
+    if module_kwargs:
+        scatter_kwargs = {}
+        for key in module_kwargs.keys():
+            scatter_kwargs[key] = scatter(module_kwargs[key], device_ids, dim)
+        gpu_dicts = tuple(
+            {key: values[i] for key, values in scatter_kwargs.items()}
+            for i in device_ids
+        )
+
     replicas = replicas[:len(scattered)]
-    outputs = parallel_apply(replicas, scattered)
-    return gather(outputs, output_device)
+    outputs = parallel_apply(replicas, scattered, gpu_dicts)
+    return gather(outputs, output_device, dim)
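Taken together, these changes let both the DataParallel wrapper and the functional data_parallel accept extra positional and keyword inputs and choose the scatter dimension. A hedged usage sketch, not part of the commit: the Scale module, its gain/bias names, and the shapes are invented, and at least two visible GPUs are assumed.

import torch
import torch.nn as nn
import torch.nn.parallel as dp
from torch.autograd import Variable


class Scale(nn.Module):
    def forward(self, x, gain, bias=None):
        out = x * gain            # gain is a plain float: broadcast to every replica
        if bias is not None:      # bias is a Variable: scattered along dim 0 like x
            out = out + bias
        return out


x = Variable(torch.randn(8, 5))
bias = Variable(torch.randn(8, 5))

# Module wrapper: extra positional and keyword inputs now pass straight through.
model = nn.DataParallel(Scale(), device_ids=[0, 1])   # dim=0 by default
out = model(x, 2.0, bias=bias)

# Functional form: keyword inputs travel through module_kwargs, and dim picks
# the dimension along which Variables are split across the devices.
out = dp.data_parallel(Scale(), (x, 2.0), (0, 1),
                       module_kwargs={'bias': bias}, dim=0)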

torch/nn/parallel/parallel_apply.py

Lines changed: 11 additions & 6 deletions
@@ -8,31 +8,36 @@
 import Queue as queue


-def parallel_apply(modules, inputs):
+def parallel_apply(modules, inputs, kwargs_tup=None):
     assert len(modules) == len(inputs)
+    if kwargs_tup:
+        assert len(modules) == len(kwargs_tup)
+    else:
+        kwargs_tup = ({},) * len(modules)
     # Fast track
     if len(modules) == 1:
-        return (modules[0](*inputs[0]),)
+        return (modules[0](*inputs[0], **kwargs_tup[0]), )

     lock = threading.Lock()
     results = {}

-    def _worker(module, input, results, lock):
+    def _worker(module, input, kwargs, results, lock):
         var_input = input
         while not isinstance(var_input, Variable):
             var_input = var_input[0]
         try:
             with torch.cuda.device_of(var_input):
-                output = module(*input)
+                output = module(*input, **kwargs)
                 with lock:
                     results[input] = output
         except Exception as e:
             with lock:
                 results[input] = e

     threads = [threading.Thread(target=_worker,
-                                args=(module, input, results, lock))
-               for module, input in zip(modules, inputs)]
+                                args=(module, input, kwargs, results, lock),
+                                )
+               for module, input, kwargs in zip(modules, inputs, kwargs_tup)]

     for thread in threads:
         thread.start()
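parallel_apply now takes an optional kwargs_tup with one keyword-argument dict per replica, or None as in the updated test above. A hedged sketch of the call shape, not from the commit: it assumes two GPUs and that each replica already lives on its device.

import torch
import torch.nn as nn
import torch.nn.parallel as dp
from torch.autograd import Variable

m0 = nn.Linear(5, 5).cuda(0)
m1 = nn.Linear(5, 5).cuda(1)

i0 = (Variable(torch.randn(3, 5).cuda(0)),)
i1 = (Variable(torch.randn(3, 5).cuda(1)),)

# No per-replica keyword arguments (None is what the updated test passes).
out0, out1 = dp.parallel_apply((m0, m1), (i0, i1), None)

# With per-replica keyword arguments: kwargs_tup needs one dict per module.
# nn.Linear takes no extra kwargs, so empty dicts stand in for the general case.
out0, out1 = dp.parallel_apply((m0, m1), (i0, i1), ({}, {}))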

torch/nn/parallel/scatter_gather.py

Lines changed: 21 additions & 9 deletions
@@ -1,25 +1,37 @@
+import torch
 from torch.autograd import Variable
 from ._functions import Scatter, Gather
+from torch.cuda.comm import broadcast


-def scatter(input, target_gpus):
-    """Slices a given variable into approximately equal chunks and distributes
-    them accross given GPUs
+def scatter(input, target_gpus, dim=0):
+    """
+    Slices variables into approximately equal chunks and
+    distributes them accross given GPUs. Duplicates
+    references to objects that are not variables. Does not
+    support Tensors.
     """
     def scatter_map(obj):
         if isinstance(obj, Variable):
-            return Scatter(target_gpus)(obj)
-        return tuple(zip(*map(scatter_map, obj)))
+            return Scatter(target_gpus, dim=dim)(obj)
+        assert not torch.is_tensor(obj), "Tensors not supported in scatter."
+        if isinstance(obj, tuple) or isinstance(obj, list):
+            return type(obj)(zip(*map(scatter_map, obj)))
+        return tuple(obj for targets in target_gpus)
+
     return scatter_map(input)


-def gather(outputs, target_device):
-    """Gathers variables from different GPUs on a specified device
-       (-1 means the CPU).
+def gather(outputs, target_device, dim=0):
+    """
+    Gathers variables from different GPUs on a specified device
+    (-1 means the CPU).
     """
     def gather_map(outputs):
         out = outputs[0]
         if isinstance(out, Variable):
-            return Gather(target_device)(*outputs)
+            return Gather(target_device, dim=dim)(*outputs)
+        if out is None:
+            return None
         return type(out)(map(gather_map, zip(*outputs)))
     return gather_map(outputs)
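The reworked scatter splits Variables along dim and duplicates non-Variable Python objects once per target GPU, while gather concatenates the per-GPU pieces back along the same dim. A hedged sketch, not from the commit: it assumes two GPUs, invented values, and that this file is importable as torch.nn.parallel.scatter_gather (the import path is an assumption of this note).

import torch
from torch.autograd import Variable
# Import path assumed, not confirmed by the diff above.
from torch.nn.parallel.scatter_gather import scatter, gather

x = Variable(torch.randn(4, 6))

# Split the Variable column-wise across GPUs 0 and 1: two pieces of size (4, 3).
x0, x1 = scatter(x, (0, 1), dim=1)

# A tuple input yields one per-GPU tuple; the non-Variable member (0.5 here)
# is duplicated into every one of them.
(x0, f0), (x1, f1) = scatter((x, 0.5), (0, 1), dim=1)

# gather concatenates the per-GPU Variables back along the same dim;
# target_device=-1 would collect onto the CPU instead.
full = gather((x0, x1), 0, dim=1)   # size (4, 6) on GPU 0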

torch/tensor.py

Lines changed: 2 additions & 2 deletions
@@ -143,14 +143,14 @@ def __iter__(self):
         return iter(map(lambda i: self.select(0, i), _range(self.size(0))))

     def split(self, split_size, dim=0):
-        """Splits this tensor into a list of tensors.
+        """Splits this tensor into a tuple of tensors.

         See :func:`torch.split`.
         """
         return torch.split(self, split_size, dim)

     def chunk(self, n_chunks, dim=0):
-        """Splits this tensor into a list of tensors.
+        """Splits this tensor into a tuple of tensors.

         See :func:`torch.chunk`.
         """
