Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ vendor/
.vscode/
*.pyc
build/
*.log
resnet50_pod/
.*.swp
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")
project(paddle-edl)
project(paddle_edl)

include(python)

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ of EDL on this cluster is
- [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md).
- [Elastic Deep Learning Design Doc:compute engine](./doc/edl_collective_design_doc.md).
- [Elastic Deep Learning Design Doc:Scheduler](./doc/edl_design_doc.md).
- [Run Elastic Deep Learning Demo on a single node](./doc/collective_demo.md).

## FAQ

Expand Down
38 changes: 38 additions & 0 deletions doc/collective_demo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Introduction
This demo is for developers of EDL: you can test Paddle EDL functionality without a Kubernetes cluster. And it's simple to test it on one or multiple nodes.
Of course, this is also a toy. You can play with it!
Have fun!

# Install
1. Install EDL from source

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. Install EDL using `pip install paddle_edl`.

# Run the demo on a single node
1. Start a Jobserver on one node.

```
git clone https://github.com/PaddlePaddle/edl
cd python/paddle_edl/demo/collective
./start_job_server.sh
```

2. Start a Jobclient on every node. Jobclient controls the POD process.

```
#Set the ImageNet data path
export PADDLE_EDL_IMAGENET_PATH=<your path>
#Set the checkpoint path
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>

mkdir -p resnet50_pod
./start_job_client.sh
```
37 changes: 37 additions & 0 deletions doc/collective_demo_cn.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# 前言
在单节点或者多个节点(物理机器或者虚拟机或者Docker之类的)搭建EDL主要是为开发者准备的:没有集群的情况下也可以对Paddle(计算引擎)模拟进行EDL的测试。
当然,这个过程也有点意思,看着训练进程起起伏伏而且不影响最后的结果,还是蛮有意思的。
Have fun!

# 安装EDL
1. 你可以从源代码编译安装

```
git clone https://github.com/PaddlePaddle/edl
cd edl
mkdir build && cd build
cmake ..
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
```

2. 也可以直接使用`pip`安装我们发布的版本`pip install paddle_edl`

# demo搭建步骤:以单节点为例
1. 我们需要在一个节点上启动JobServer的demo,用来记录训练任务的Pod信息。

```
git clone https://github.com/PaddlePaddle/edl
cd python/paddle_edl/demo/collective
./start_job_server.sh
```
2. 我们需要在(各个)节点上启动一个JobClient的demo,用来管理训练的Pod进程。

```
#指定ImageNet的数据目录路径
export PADDLE_EDL_IMAGENET_PATH=<your path>
#指定`checkpoint`的目录,用来保存checkpoint
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>

mkdir -p resnet50_pod
./start_job_client.sh
```
4 changes: 2 additions & 2 deletions example/collective/resnet50/train_pretrain.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
export FLAGS_sync_nccl_allreduce=1
export FLAGS_cudnn_exhaustive_search=1
export FLAGS_conv_workspace_size_limit=4000 #MB
#export FLAGS_conv_workspace_size_limit=4000 #MB
export FLAGS_cudnn_batchnorm_spatial_persistent=1

export GLOG_v=1
Expand All @@ -18,7 +18,7 @@ if [[ ${use_dali} == "True" ]]; then
export FLAGS_fraction_of_gpu_memory_to_use=0.8
fi

python -m paddle-edl.launch ${distributed_args} \
python -m paddle_edl.collective.launch ${distributed_args} \
--log_dir log \
--log_level 20 \
./train_with_fleet.py \
Expand Down
6 changes: 3 additions & 3 deletions example/collective/resnet50/train_with_fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@
add_arg('do_test', bool, False, "Whether do test every epoch.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('fuse', bool, False, "Whether to use tensor fusion.")
add_arg('fuse_elewise_add_act_ops', bool, True, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, True, "Whether to use bn_act fusion.")
add_arg('fuse_elewise_add_act_ops', bool, False, "Whether to use elementwise_act fusion.")
add_arg('fuse_bn_act_ops', bool, False, "Whether to use bn_act fusion.")
add_arg('nccl_comm_num', int, 1, "nccl comm num")
add_arg("use_hierarchical_allreduce", bool, False, "Use hierarchical allreduce or not.")
add_arg('num_threads', int, 1, "Use num_threads to run the fluid program.")
add_arg('num_iteration_per_drop_scope', int, 100, "Ihe iteration intervals to clean up temporary variables.")
add_arg('benchmark_test', bool, True, "Whether to use print benchmark logs or not.")
add_arg('benchmark_test', bool, False, "Whether to use print benchmark logs or not.")

add_arg('use_dgc', bool, False, "Whether use DGCMomentum Optimizer or not")
add_arg('rampup_begin_step', int, 5008, "The beginning step from which dgc is implemented.")
Expand Down
4 changes: 2 additions & 2 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh setup.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(
OUTPUT ${EDL_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_edl ${EDL_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} ./setup.py bdist_wheel
DEPENDS ${EDL_FILES})
add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
add_subdirectory(edl/tests/unittests)
add_subdirectory(paddle_edl/tests/unittests)
2 changes: 0 additions & 2 deletions python/edl/tests/unittests/start_edl_demo.sh

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import requests
import time
import sys
from utils import Cluster, Pod, Trainer, logger
from utils import Cluster, Pod, Trainer, logger, Hdfs
from http_store import kv_server


class Edlenv(object):
Expand Down Expand Up @@ -173,7 +174,7 @@ def edl_barrier(edl_env, hdfs, timeout=-1):
if pod.rank == 0 and not kv_server.is_alive():
kv_server.start("0.0.0.0", pod.port)

ret = edl_utils.barrier(cluster=cluster, pod=pod)
ret = barrier(cluster=cluster, pod=pod)
if ret:
break

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def start(self, host, port):
current_env["PYTHONUNBUFFERED"] = "1"

self._cmd = [
sys.executable, "-m", "paddle.distributed.http_store",
sys.executable, "-m", "paddle_edl.collective.http_store",
"--host={}".format(host), "--port={}".format(port)
]
logger.info("start http store:{}".format(self._cmd))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from contextlib import closing
import socket

from paddle.distributed.utils import *
from utils import *
import edl_utils
from http_store import kv_server

Expand Down Expand Up @@ -124,6 +124,7 @@ def _parse_args():
parser.add_argument(
"--log_dir",
type=str,
default=None,
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)

Expand Down Expand Up @@ -169,10 +170,15 @@ def launch(args):
assert edl_env.is_under_edl(), "edl launch must run under edl env"

hdfs = edl_utils.get_hdfs_from_args(args)
cluster, pod = edl_barrier(edl_env, hdfs, timeout=15 * 60)
cluster, pod = edl_utils.edl_barrier(edl_env, hdfs, timeout=15 * 60)
logger.info("get cluster from edl:{}".format(cluster))

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

while True:
cluster2, pod = edl_env.get_cluster(hdfs)
Expand All @@ -182,9 +188,15 @@ def launch(args):
format(cluster2, cluster))
terminate_local_procs(procs)

cluster, pod = edl_barrier(edl_env, hdfs, timeout=30 * 60)
cluster, pod = edl_utils.edl_barrier(
edl_env, hdfs, timeout=30 * 60)

procs = start_local_trainers(cluster, pod)
procs = start_local_trainers(
cluster,
pod,
args.training_script,
args.training_script_args,
log_dir=args.log_dir)

alive = watch_local_trainers(procs, cluster.trainers_nranks())

Expand All @@ -194,7 +206,7 @@ def launch(args):

time.sleep(3)

edl_barrier(edl_env, hdfs)
edl_utils.edl_barrier(edl_env, hdfs)


if __name__ == "__main__":
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@

import os
import sys
from edl.collective.edl_utils import Edlenv
from edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from paddle_edl.collective.edl_utils import Edlenv
from paddle_edl.collective.utils import logger, get_logger, terminate_local_procs, get_host_name_ip
from argparse import ArgumentParser, REMAINDER
import six
import copy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import argparse
import copy
import functools
import edl.collective.utils as utils
import paddle_edl.collective.utils as utils

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(utils.add_arguments, argparser=parser)
Expand All @@ -22,7 +22,7 @@
add_arg('del_pods_one_step', int, 1, "")
add_arg('add_pods_one_step', int, 1, "")
add_arg('time_interval_to_change', int, 900, "")
add_arg('server_port', int, 6070, "")
add_arg('server_port', int, 8180, "")

random.seed(10)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ while true ; do
done


src_dir=../../../example/collective/resnet50
src_dir=../../../../example/collective/resnet50
dst_dir=resnet50_pod/${pod_id}

echo "mkdir resnet50_pod/${pod_id}"
Expand All @@ -29,10 +29,9 @@ cp -r ${src_dir}/models ${dst_dir}/models
cp -r ${src_dir}/scripts ${dst_dir}/scripts

if [[ ! -d "${dst_dir}/ImageNet" ]]; then
ln -s /root/go/dataset/ImageNet/ ${dst_dir}/
ln -s ${PADDLE_EDL_IMAGENET_PATH} ${dst_dir}/
fi


if [[ ! -d "${dst_dir}/fleet_checkpoints" ]]; then
ln -s /root/go/checkpoints/resnet50/fleet_checkpoints ${dst_dir}/fleet_checkpoints
ln -s ${PADDLE_EDL_FLEET_CHECKPOINT_PATH} ${dst_dir}/fleet_checkpoints
fi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ echo "${BASEDIR}"

nohup python -u ${BASEDIR}/job_server_demo.py \
--node_ips ${node_ips} \
--pod_num_of_node 2 \
--pod_num_of_node 8 \
--time_interval_to_change 900 \
--gpu_num_of_node 8 > job_server.log 2>&1 &
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# limitations under the License.
import os
pod_id = os.getenv("PADDLE_POD_ID", "")
print(pod_id)
print(pod_id + "__edl_demo__")
2 changes: 2 additions & 0 deletions python/paddle_edl/tests/unittests/start_edl_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
python -m paddle_edl.collective.launch edl_demo.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
set -e
#set -x

echo "python_path:${PYTHONPATH}"
unset http_proxy https_proxy
Expand All @@ -8,7 +8,9 @@ unset http_proxy https_proxy
BASEDIR=$(dirname $(readlink -f $0))
echo "${BASEDIR}"

nohup python -m edl.demo.collective.job_server_demo --pod_num_of_node 2 \
rm -rf job_server.log job_client.log ./edl_demo_log

nohup python -m paddle_edl.demo.collective.job_server_demo --pod_num_of_node 2 \
--time_interval_to_change 900 \
--gpu_num_of_node 2 \
--pod_num_of_node 2 \
Expand All @@ -24,7 +26,7 @@ export PADDLE_JOBSERVER="http://127.0.0.1:8180"
export PADDLE_JOB_ID="test_job_id_1234"
export PADDLE_POD_ID="not set"

nohup python -m edl.demo.collective.job_client_demo \
nohup python -m paddle_edl.demo.collective.job_client_demo \
--log_level 20 \
--log_dir ./edl_demo_log \
./start_edl_demo.sh > job_client.log 2>&1 &
Expand All @@ -34,7 +36,7 @@ echo "launcher_pid:${job_client_pid}"
sleep 30s

echo "test request and response"
str="pod_0_0"
str="pod_0_0__edl_demo__"
file=./edl_demo_log/pod_pod_0_0.log

kill ${server_pid} ${job_client_pid}
Expand All @@ -47,5 +49,9 @@ else
cat job_server.log
echo "job_client.log"
cat job_client.log
echo "pod pod 0"
cat edl_demo_log/pod_pod_0_0.log
echo "pod pod 1"
cat edl_demo_log/pod_pod_1_0.log
exit -1
fi
Loading