
Commit ea31b4d

Add unit test.
1 parent 483b840 commit ea31b4d

File tree

3 files changed: +57 −5 lines changed


paddle/fluid/framework/parallel_executor.cc
Lines changed: 4 additions & 0 deletions

@@ -43,6 +43,10 @@ class ParallelExecutorPrivate {
 #endif
 };
 
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
     size_t num_threads, bool use_event,
     const std::vector<platform::Place> &places,
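Note: the new GetLocalScopes() accessor returns the per-device scopes held by ParallelExecutorPrivate. The Python change below consumes it through share_vars_from.executor.local_scopes(), so the accessor is presumably exposed on the bound core.ParallelExecutor under that name, letting a second executor reuse the variables of an existing one instead of creating its own.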

python/paddle/fluid/parallel_executor.py
Lines changed: 8 additions & 4 deletions

@@ -23,11 +23,12 @@
 class ParallelExecutor(object):
     def __init__(self,
                  use_cuda,
-                 main_program,
-                 startup_program,
                  loss_name=None,
+                 main_program=None,
+                 startup_program=None,
                  num_threads=None,
                  allow_op_delay=False,
+                 run_startup=True,
                  share_vars_from=None):
         self._places = []
         self._act_places = []
@@ -55,13 +56,16 @@ def __init__(self,
 
         main = main_program
         startup = startup_program
+        main = main if main else framework.default_main_program()
+        startup = startup if startup else framework.default_startup_program()
         scope = executor.global_scope()
 
-        if startup:
+        if run_startup:
             exe = executor.Executor(self._act_places[0])
             exe.run(startup)
 
-        local_scopes = share_vars_from.local_scopes() if share_vars_from else []
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
 
         self.executor = core.ParallelExecutor(
             num_threads,
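Because main_program and startup_program now default to None, a caller that builds its network in the default programs can omit them entirely. A minimal sketch of the new constructor usage, assuming a loss variable built with ordinary fluid layers (the tiny network here is illustrative, not part of this commit):

import paddle.fluid as fluid

# Build a toy network in the default main/startup programs.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.mean(x=fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

# main_program/startup_program fall back to the framework defaults, and
# run_startup=True (the default) runs the startup program once to
# initialize parameters before the first call to run().
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)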

python/paddle/fluid/tests/unittests/test_parallel_executor.py
Lines changed: 45 additions & 1 deletion

@@ -207,7 +207,7 @@ def check_network_convergence(self,
         if memory_opt:
             fluid.memory_optimize(main)
 
-        exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+        exe = fluid.ParallelExecutor(True, loss_name=loss.name)
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count()
         begin = time.time()
@@ -453,3 +453,47 @@ def setUpClass(cls):
     @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer)
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def test_parallel_testing(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net(True)
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = numpy.random.normal(size=(batch_size,
+                                              784)).astype('float32')
+            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
+            place = fluid.CUDAPlace(0)
+            im_t = fluid.LoDTensor()
+            im_t.set(image, place)
+            lbl_t = fluid.LoDTensor()
+            lbl_t.set(label, place)
+            feed_dict = {'image': im_t, 'label': lbl_t}
+
+            train_exe = fluid.ParallelExecutor(
+                loss_name=loss.name,
+                use_cuda=True,
+                main_program=main,
+                startup_program=startup)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                startup_program=startup,
+                run_startup=False,
+                share_vars_from=train_exe)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+                test_loss = numpy.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+                train_loss = numpy.array(train_loss)
+                self.assertTrue(numpy.allclose(train_loss, test_loss))
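The added test exercises the new parameters end to end: the test executor is built from a for_test clone of the training program, skips the startup program via run_startup=False, and borrows the training executor's local scopes via share_vars_from, so both executors should report the same loss for an identical feed, which the final allclose assertion checks.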
