Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ vendor/
.vscode/
*.pyc
build/
*.log
resnet50_pod/
.*.swp
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")
project(paddle-edl)
project(paddle_edl)

include(python)

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ of EDL on this cluster is
- [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md).
- [Elastic Deep Learning Design Doc:compute engine](./doc/edl_collective_design_doc.md).
- [Elastic Deep Learning Design Doc:Scheduler](./doc/edl_design_doc.md).
- [Run Elastic Deep Learning Demo on a single node](./doc/collective_demo.md).

## FAQ

Expand Down
38 changes: 38 additions & 0 deletions doc/collective_demo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Introduction
This demo is for developers of EDL: you can test Paddle EDL functionality without a Kubernetes cluster. And it's simple to test it on one or multiple nodes.
Of course, this is also a toy. You can play with it!
Have fun!

# Install
1. Install EDL from source

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. Install EDL using `pip install paddle_edl`.

# Run the demo on a single node
1. Start a Jobserver on one node.

```
git clone https://github.com/PaddlePaddle/edl
cd python/paddle_edl/demo/collective
./start_job_server.sh
```

2. Start a Jobclient on every node. Jobclient controls the POD process.

```
#Set the ImageNet data path
export PADDLE_EDL_IMAGENET_PATH=<your path>
#Set the checkpoint path
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>

mkdir -p resnet50_pod
./start_job_client.sh
```
37 changes: 37 additions & 0 deletions doc/collective_demo_cn.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# 前言
在单节点或者多个节点(物理机器或者虚拟机或者Docker之类的)搭建EDL主要是为开发者准备的:没有集群的情况下也可以对Paddle(计算引擎)模拟进行EDL的测试。
当然,这个过程也有点意思,看着训练进程起起伏伏而且不影响最后的结果,还是蛮有意思的。
Have fun!

# 安装EDL
1. 你可以从源代码编译安装

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. 也可以直接使用`pip`安装我们发布的版本`pip install paddle_edl`

# demo搭建步骤:以单节点为例
1. 我们需要在一个节点上启动JobServer的demo,用来记录训练任务的Pod信息。

```
git clone https://github.com/PaddlePaddle/edl
cd python/paddle_edl/demo/collective
./start_job_server.sh
```
2. 我们需要在(各个)节点上启动一个JobClient的demo,用来管理训练的Pod进程。

```
#指定ImageNet的数据目录路径
export PADDLE_EDL_IMAGENET_PATH=<your path>
#指定`checkpoint`的目录,用来保存checkpoint
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>

mkdir -p resnet50_pod
./start_job_client.sh
```
4 changes: 2 additions & 2 deletions example/collective/resnet50/train_pretrain.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
export FLAGS_sync_nccl_allreduce=1
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_conv_workspace_size_limit=4000 #MB
#export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_batchnorm_spatial_persistent=1

export GLOG_v=1
Expand All @@ -18,7 +18,7 @@ if [[ ${use_dali} == "True" ]]; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi

python -m paddle-edl.launch ${distributed_args} \
python -m paddle_edl.collective.launch ${distributed_args} \
--log_dir log \
--log_level 20 \
./train_with_fleet.py \
Expand Down
6 changes: 3 additions & 3 deletions example/collective/resnet50/train_with_fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@
add_arg('do_test', bool, False, "Whether do test every epoch.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('fuse', bool, False, "Whether to use tensor fusion.")
add_arg('fuse_elewise_add_act_ops', bool, True, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, True, "Whether to use bn_act fusion.")
add_arg('fuse_elewise_add_act_ops', bool, False, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, False, "Whether to use bn_act fusion.")
add_arg('nccl_comm_num', int, 1, "nccl comm num")
add_arg("use_hierarchical_allreduce", bool, False, "Use hierarchical allreduce or not.")
add_arg('num_threads', int, 1, "Use num_threads to run the fluid program.")
add_arg('num_iteration_per_drop_scope', int, 100, "Ihe iteration intervals to clean up temporary variables.")
add_arg('benchmark_test', bool, True, "Whether to use print benchmark logs or not.")
add_arg('benchmark_test', bool, False, "Whether to use print benchmark logs or not.")

add_arg('use_dgc', bool, False, "Whether use DGCMomentum Optimizer or not")
add_arg('rampup_begin_step', int, 5008, "The beginning step from which dgc is implemented.")
Expand Down
4 changes: 2 additions & 2 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh setup.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(
OUTPUT ${EDL_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_edl ${EDL_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} ./setup.py bdist_wheel
DEPENDS ${EDL_FILES})
add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
add_subdirectory(edl/tests/unittests)
add_subdirectory(paddle_edl/tests/unittests)
2 changes: 0 additions & 2 deletions python/edl/tests/unittests/start_edl_demo.sh

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import requests
import time
import sys
from utils import Cluster, Pod, Trainer, logger
from utils import Cluster, Pod, Trainer, logger, Hdfs
from http_store import kv_server


class Edlenv(object):
Expand Down Expand Up @@ -173,7 +174,7 @@ def edl_barrier(edl_env, hdfs, timeout=-1):
if pod.rank == 0 and not kv_server.is_alive():
kv_server.start("0.0.0.0", pod.port)

ret = edl_utils.barrier(cluster=cluster, pod=pod)
ret = barrier(cluster=cluster, pod=pod)
if ret:
break

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def start(self, host, port):
current_env["PYTHONUNBUFFERED"] = "1"

self._cmd = [
sys.executable, "-m", "paddle.distributed.http_store",
sys.executable, "-m", "paddle_edl.collective.http_store",
"--host={}".format(host), "--port={}".format(port)
]
logger.info("start http store:{}".format(self._cmd))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from contextlib import closing
import socket

from paddle.distributed.utils import *
from utils import *
import edl_utils
from http_store import kv_server

Expand Down Expand Up @@ -124,6 +124,7 @@ def _parse_args():
parser.add_argument(
"--log_dir",
type=str,
default=None,
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)

Expand Down Expand Up @@ -169,10 +170,15 @@ def launch(args):
assert edl_env.is_under_edl(), "edl launch must run under edl env"

hdfs = edl_utils.get_hdfs_from_args(args)
cluster, pod = edl_barrier(edl_env, hdfs, timeout=15 * 60)
cluster, pod = edl_utils.edl_barrier(edl_env, hdfs, timeout=15 * 60)
logger.info("get cluster from edl:{}".format(cluster))

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

while True:
cluster2, pod = edl_env.get_cluster(hdfs)
Expand All @@ -182,9 +188,15 @@ def launch(args):
format(cluster2, cluster))
terminate_local_procs(procs)

cluster, pod = edl_barrier(edl_env, hdfs, timeout=30 * 60)
cluster, pod = edl_utils.edl_barrier(
edl_env, hdfs, timeout=30 * 60)

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

alive = watch_local_trainers(procs, cluster.trainers_nranks())

Expand All @@ -194,7 +206,7 @@ def launch(args):

time.sleep(3)

edl_barrier(edl_env, hdfs)
edl_utils.edl_barrier(edl_env, hdfs)


if __name__ == "__main__":
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@

import os
import sys
from edl.collective.edl_utils import Edlenv
from edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from paddle_edl.collective.edl_utils import Edlenv
from paddle_edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from argparse import ArgumentParser, REMAINDER
import six
import copy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import argparse
import copy
import functools
import edl.collective.utils as utils
import paddle_edl.collective.utils as utils

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(utils.add_arguments, argparser=parser)
Expand All @@ -22,7 +22,7 @@
add_arg('del_pods_one_step', int, 1, "")
add_arg('add_pods_one_step', int, 1, "")
add_arg('time_interval_to_change', int, 900, "")
add_arg('server_port', int, 6070, "")
add_arg('server_port', int, 8180, "")

random.seed(10)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ while true ; do
done


src_dir=../../../example/collective/resnet50
src_dir=../../../../example/collective/resnet50
dst_dir=resnet50_pod/${pod_id}

echo "mkdir resnet50_pod/${pod_id}"
Expand All @@ -29,10 +29,9 @@ cp -r ${src_dir}/models ${dst_dir}/models
cp -r ${src_dir}/scripts ${dst_dir}/scripts

if [[ ! -d "${dst_dir}/ImageNet" ]]; then
ln -s /root/go/dataset/ImageNet/ ${dst_dir}/
ln -s ${PADDLE_EDL_IMAGENET_PATH} ${dst_dir}/
fi


if [[ ! -d "${dst_dir}/fleet_checkpoints" ]]; then
ln -s /root/go/checkpoints/resnet50/fleet_checkpoints ${dst_dir}/fleet_checkpoints
ln -s ${PADDLE_EDL_FLEET_CHECKPOINT_PATH} ${dst_dir}/fleet_checkpoints
fi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ echo "${BASEDIR}"

nohup python -u ${BASEDIR}/job_server_demo.py \
--node_ips ${node_ips} \
--pod_num_of_node 2 \
--pod_num_of_node 8 \
--time_interval_to_change 900 \
--gpu_num_of_node 8 > job_server.log 2>&1 &
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# limitations under the License.
import os
pod_id = os.getenv("PADDLE_POD_ID", "")
print(pod_id)
print(pod_id + "__edl_demo__")
2 changes: 2 additions & 0 deletions python/paddle_edl/tests/unittests/start_edl_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
python -m paddle_edl.collective.launch edl_demo.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
set -e
#set -x

echo "python_path:${PYTHONPATH}"
unset http_proxy https_proxy
Expand All @@ -8,7 +8,9 @@ unset http_proxy https_proxy
BASEDIR=$(dirname $(readlink -f $0))
echo "${BASEDIR}"

nohup python -m edl.demo.collective.job_server_demo --pod_num_of_node 2 \
rm -rf job_server.log job_client.log ./edl_demo_log

nohup python -m paddle_edl.demo.collective.job_server_demo --pod_num_of_node 2 \
--time_interval_to_change 900 \
--gpu_num_of_node 2 \
--pod_num_of_node 2 \
Expand All @@ -24,7 +26,7 @@ export PADDLE_JOBSERVER="http://127.0.0.1:8180"
export PADDLE_JOB_ID="test_job_id_1234"
export PADDLE_POD_ID="not set"

nohup python -m edl.demo.collective.job_client_demo \
nohup python -m paddle_edl.demo.collective.job_client_demo \
--log_level 20 \
--log_dir ./edl_demo_log \
./start_edl_demo.sh > job_client.log 2>&1 &
Expand All @@ -34,7 +36,7 @@ echo "launcher_pid:${job_client_pid}"
sleep 30s

echo "test request and response"
str="pod_0_0"
str="pod_0_0__edl_demo__"
file=./edl_demo_log/pod_pod_0_0.log

kill ${server_pid} ${job_client_pid}
Expand All @@ -47,5 +49,9 @@ else
cat job_server.log
echo "job_client.log"
cat job_client.log
echo "pod pod 0"
cat edl_demo_log/pod_pod_0_0.log
echo "pod pod 1"
cat edl_demo_log/pod_pod_1_0.log
exit -1
fi
Loading