elasticdeeplearning
diff --git a/‎python/edl/collective/launch.py‎
Lines changed: 11 additions & 186 deletions b/‎python/edl/collective/launch.py‎
Lines changed: 11 additions & 186 deletions
diff --git a/‎python/edl/tests/unittests/CMakeLists.txt‎
Lines changed: 1 addition & 2 deletions b/‎python/edl/tests/unittests/CMakeLists.txt‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎python/edl/tests/unittests/del_from_etcd.py‎
Lines changed: 7 additions & 12 deletions b/‎python/edl/tests/unittests/del_from_etcd.py‎
Lines changed: 7 additions & 12 deletions
diff --git a/‎python/edl/tests/unittests/etcd_test.sh‎
Lines changed: 0 additions & 5 deletions b/‎python/edl/tests/unittests/etcd_test.sh‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎python/edl/tests/unittests/etcd_test_base.py‎
Lines changed: 2 additions & 11 deletions b/‎python/edl/tests/unittests/etcd_test_base.py‎
Lines changed: 2 additions & 11 deletions
diff --git a/‎python/edl/tests/unittests/test_launch.py‎
Lines changed: 41 additions & 0 deletions b/‎python/edl/tests/unittests/test_launch.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎python/edl/tests/unittests/test_register.sh‎ renamed to ‎python/edl/tests/unittests/test_launch.sh‎
Lines changed: 2 additions & 2 deletions b/‎python/edl/tests/unittests/test_register.sh‎ renamed to ‎python/edl/tests/unittests/test_launch.sh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/edl/utils/cluster_generator.py‎
Lines changed: 4 additions & 3 deletions b/‎python/edl/utils/cluster_generator.py‎
Lines changed: 4 additions & 3 deletions
@@ -19,58 +19,19 @@
 from __future__ import print_function
 
 import sys
-import time
-import traceback
 from edl.utils import args_utils
-from edl.utils import constants
 from edl.utils import env as edl_env
 from edl.utils import etcd_db
-from edl.utils import exceptions
+from edl.utils import launcher as edl_launcher
 from edl.utils import log_utils
-from edl.utils import pod_server_client
 from edl.utils import status as edl_status
-from edl.utils import train_process as edl_train_process
-from edl.utils import resource_pods
+from edl.utils.log_utils import logger
+from edl.utils import pod as edl_pod
 
-from edl.utils import leader_pod
-from ..utils.log_utils import logger
-from ..utils import pod
-from ..utils import pod_server
-from ..utils import cluster_watcher
 
-
-def edl_barrier(job_env, pod, timeout):
- start = time.time()
-
- log_time = time.time()
- while True:
- try:
- etcd = etcd_db.get_global_etcd()
- leader = leader_pod.load_from_etcd(etcd)
- if leader is None:
- raise exceptions.EdlNotFoundLeader("can't get leader")
-
- logger.debug("barrier on leader:{}".format(leader))
-
- c = pod_server_client.Client(leader.endpoint)
- cluster = c.barrier(job_env.job_id, pod.get_id())
- return cluster
- except Exception as e:
- if time.time() - log_time > 30:
- logger.info("wait to barrier now!")
- log_time = time.time()
- logger.debug("barrier error:{} {}".format(e,
- traceback.format_exc()))
-
- if time.time() - start > timeout:
- message = "wait to barrier with all error:{} leader:[{}] current pod:[{}]".format(
- traceback.format_exc(), leader, pod)
- raise exceptions.EdlBarrierError(message)
-
- time.sleep(3)
-
-
-def prepare(args):
+def main():
+ log_utils.get_logger(log_level=10)
+ args = args_utils.parse_args()
  args_dict = args_utils.convert_args_to_dict(args)
 
  # job environment.
@@ -80,7 +41,7 @@ def prepare(args):
  # get global etcd and lock
  etcd = etcd_db.get_global_etcd(job_env.etcd_endpoints, job_env.job_id)
 
- last_status = edl_status.load_job_status_from_etcd(etcd)
+ last_status = edl_status.load_job_status_from_etcd(etcd, timeout=30)
  if last_status == edl_status.Status.SUCCEED:
  logger.info("job:{} has completed! Need't try!".format(job_env.job_id))
  sys.exit(0)
@@ -89,146 +50,10 @@ def prepare(args):
  pod = edl_pod.Pod()
  pod.from_env(job_env)
 
- # update pod status
- edl_status.save_pod_status_to_etcd(etcd,
- pod.get_id(), edl_status.Status.INITIAL)
-
- # launch pod server
- pod_server = PodServer(job_env, pod.get_id())
- pod_server.start(job_env, pod)
- logger.info("pod server started:[{}]".format(pod))
-
- return job_env, pod, pod_server
-
-
-def job_exit(cluster,
- leader_register,
- resource_register,
- watcher,
- pod,
- trainer_flag,
- register_flag,
- barrier_flag,
- resource_flag,
- timeout=300):
- local_flag = trainer_flag & register_flag & barrier_flag
- etcd = etcd_db.get_global_etcd()
- edl_status.save_pod_flag_to_ecd(etcd, pod.get_id(), local_flag)
-
- begin = time.time()
- while True:
- try:
- if leader_register.is_leader():
- if etcd.wait_resource(cluster, timeout=15):
- job_flag = trainer_flag & register_flag & barrier_flag & resource_flag
- edl_status.save_job_flag_to_etcd(etcd, job_flag)
- logger.info("set job status:{} ok!".format(job_flag))
- break
- raise exceptions.EdlWaitFollowersReleaseError(
- "can't wait resource")
- else:
- break
- except Exception as e:
- logger.warning("prepare job_exit meets error:{}".format(e))
- if time.time() - begin >= timeout:
- logger.warning("wait resource error")
- break
-
- time.sleep(3)
- continue
-
- leader_register.stop()
- watcher.stop()
- resource_register.stop()
-
-
-def launch(args):
- job_env, pod, pod_server = prepare(args)
-
- # register pod resource to tell others:
- # this resource can use to train
- resource_register = resource_pods.Register(job_env, pod)
-
- # seize the leader
- leader_register = leader_pod.Register(job_env, pod.get_id())
-
- # register rank and watch the rank
- # if the rank changed, the pods should restart the training proc.
- # pod exit if barrier error
- cluster = edl_barrier(job_env, pod, timeout=600)
-
- # update pod status
- etcd = etcd_db.get_global_etcd()
- edl_status.save_pod_status_to_etcd(etcd,
- pod.get_id(), edl_status.Status.RUNNING)
-
- # watcher after barrier
- watcher = cluster_watcher.Watcher(job_env, cluster, pod)
-
- procs = edl_train_process.start(
- cluster,
- pod,
- args.training_script,
- args.training_script_args,
- log_dir=args.log_dir)
-
- trainer_flag = True
- register_flag = True
- barrier_flag = True
- while True:
- # check local status first
- alive, trainer_flag = edl_train_process.watch(procs, pod.trainers_num)
- if not alive or not trainer_flag:
- break
-
- if resource_register.is_stopped() or leader_register.is_stopped():
- edl_train_process.terminate()
- register_flag = False
- break
-
- # check job status second
- if watcher.changed:
- new_cluster = edl_barrier(job_env, pod, timeout=60)
- if not new_cluster:
- barrier_flag = False
- break
-
- edl_train_process.terminate(procs)
-
- cluster = new_cluster
- watcher = cluster_watcher.Watcher(job_env, cluster, pod)
-
- procs = edl_train_process.start(
- job_env,
- cluster,
- pod,
- args.training_script,
- args.training_script_args,
- log_dir=args.log_dir)
-
- time.sleep(3)
-
- if not register_flag:
- logger.fatal("register meets error and local exit!")
-
- if not leader_register.is_leader():
- leader_register.stop()
-
- job_exit(
- cluster=cluster,
- leader_register=leader_register,
- resource_register=resource_register,
- watcher=watcher,
- pod=pod,
- trainer_flag=trainer_flag,
- register_flag=register_flag,
- barrier_flag=barrier_flag)
-
-
-def main():
- log_utils.get_logger(log_level=10)
- args = args_utils.parse_args()
- launch(args)
+ launcher = edl_launcher.Launcher(
+ job_env=job_env, pod=pod, etcd=etcd, args=args)
+ launcher.init()
+ launcher.launch()
 
 
 if __name__ == '__main__':
 
@@ -76,15 +76,14 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 LIST(REMOVE_ITEM TEST_OPS test_data_reader)
 LIST(REMOVE_ITEM TEST_OPS test_train)
+LIST(REMOVE_ITEM TEST_OPS test_launch)
 foreach(TEST_OP ${TEST_OPS})
  bash_test_modules(${TEST_OP} START_BASH etcd_test.sh ENVS "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}")
 endforeach(TEST_OP)
 
 # bash unit test
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.sh")
 string(REPLACE ".sh" "" TEST_OPS "${TEST_OPS}")
-LIST(REMOVE_ITEM TEST_OPS test_register)
-LIST(REMOVE_ITEM TEST_OPS test_launch)
 foreach(TEST_OP ${TEST_OPS})
  bash_test_modules(${TEST_OP} START_BASH "${TEST_OP}.sh" ENVS "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}")
 endforeach(TEST_OP)
@@ -12,19 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import edl.utils.constants as constants
 import os
-from edl.utils.etcd_db import get_global_etcd
+from edl.utils import constants
+from edl.discovery import etcd_client
+
+g_etcd_endpoints = "127.0.0.1:2379"
 
 job_id = os.environ["PADDLE_JOB_ID"]
 etcd_endpoints = os.environ["PADDLE_ETCD_ENDPOINTS"]
-
-db = get_global_etcd([etcd_endpoints], job_id)
-etcd = db._etcd
-etcd.remove_service(constants.ETCD_POD_RESOURCE)
-etcd.remove_service(constants.ETCD_POD_RANK)
-etcd.remove_service(constants.ETCD_POD_STATUS)
-etcd.remove_service(constants.ETCD_JOB_STATUS)
-etcd.remove_service(constants.ETCD_TRAIN_STATUS)
-etcd.remove_service(constants.ETCD_CLUSTER)
-etcd.remove_service(constants.ETCD_READER)
+etcd = etcd_client.EtcdClient([g_etcd_endpoints], root=job_id)
+etcd.init()
+constants.clean_etcd(etcd)
@@ -15,18 +15,13 @@ fi
 # rm flag file
 rm -f ${name}_*.log
 
-nohup etcd > ${name}_etcd.log 2>&1 &
-etcd_pid=$!
-
 # start the unit test
 run_time=$(( $TEST_TIMEOUT - 10 ))
 echo "run_time: ${run_time}"
 
 timeout -s SIGKILL ${run_time} ${PYTHON_EXECUTABLE} -u ${name}.py > ${name}_run.log 2>&1
 exit_code=$?
 
-kill -9 $etcd_pid
-
 echo "${name} faild with ${exit_code}"
 if [[ $exit_code -eq 0 ]]; then
  exit 0
 
@@ -24,15 +24,6 @@
 
 
 class EtcdTestBase(unittest.TestCase):
- def _clean_etcd(self):
- self._etcd.remove_service(constants.ETCD_POD_RESOURCE)
- self._etcd.remove_service(constants.ETCD_POD_RANK)
- self._etcd.remove_service(constants.ETCD_POD_STATUS)
- self._etcd.remove_service(constants.ETCD_JOB_STATUS)
- self._etcd.remove_service(constants.ETCD_TRAIN_STATUS)
- self._etcd.remove_service(constants.ETCD_CLUSTER)
- self._etcd.remove_service(constants.ETCD_READER)
-
  def setUp(self, job_id):
  log_utils.get_logger(log_level=10)
  self._etcd = EtcdClient([g_etcd_endpoints], root=job_id)
@@ -62,9 +53,9 @@ def setUp(self, job_id):
  os.environ.update(proc_env)
 
  self._job_env = edl_env.JobEnv(None)
- self._clean_etcd()
+ constants.clean_etcd(self._etcd)
 
  def tearDown(self):
  os.environ.clear()
  os.environ.update(self._old_environ)
- self._clean_etcd()
+ constants.clean_etcd(self._etcd)
@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+from edl.tests.unittests import etcd_test_base
+from edl.utils import env as edl_env
+from edl.utils import etcd_db
+from edl.utils import pod as edl_pod
+from edl.utils import status as edl_status
+from edl.utils.log_utils import logger
+from edl.utils import launcher as edl_launcher
+
+
+class TestLauncher(etcd_test_base.EtcdTestBase):
+ def setUp(self):
+ super(TestLauncher, self).setUp("test_launcher")
+
+ def test_normal_exit(self):
+ launcher = edl_launcher(self._job_env, self._pod, self._etcd, args)
+ launcher.init()
+ launcher.launch()
+
+ last_status = edl_status.load_job_status_from_etcd(self._etcd)
+ if last_status == edl_status.Status.SUCCEED:
+ logger.info("job:{} has completed! Need't try!".format(
+ self._job_env.job_id))
+ return
+ self.assertFalse(True)
@@ -26,7 +26,7 @@ export PADDLE_JOB_ID="test_success_job"
 export PADDLE_ETCD_ENDPOINTS="127.0.0.1:2379"
 export PADDLE_EDLNODES_RANAGE="2:2"
 export PADDLE_EDL_ONLY_FOR_CE_TEST="1"
-export PADDLE_EDL_HDFS_CHECKPOINT_PATH="./success_job"
+export PADDLE_EDL_HDFS_PATH="./success_job"
 export PADDLE_EDL_HDFS_HOME="./hadoop"
 
 #clean keys
@@ -43,7 +43,7 @@ export PADDLE_DEMO_EXIT_CODE=0
 timeout -s SIGKILL ${run_time} python -m edl.collective.launch --log_dir 01 launch_demo.py > ${name}_run_01.log 2>&1 &
 pid_01=$!
 
-key="/${PADDLE_JOB_ID}/job_flag/nodes/complete"
+key="/${PADDLE_JOB_ID}/job_flag/nodes/job_status"
 value=`etcdctl get ${key}`
 echo "job complete flag:${value}"
 
 
@@ -80,9 +80,10 @@ def _generate_cluster(self, timeout=600):
 
  def stop(self):
  self._stop.set()
- with self._lock:
- if self._t_register:
- self._t_register.join()
+ if self._t_register:
+ self._t_register.join()
+
+ with self._lock:
  self._t_register = None
 
  logger.debug("{} exit".format(self.__class__.__name__))