1919from __future__ import print_function
2020
2121import sys
22- import time
23- import traceback
2422from edl .utils import args_utils
25- from edl .utils import constants
2623from edl .utils import env as edl_env
2724from edl .utils import etcd_db
28- from edl .utils import exceptions
25+ from edl .utils import launcher as edl_launcher
2926from edl .utils import log_utils
30- from edl .utils import pod_server_client
3127from edl .utils import status as edl_status
32- from edl .utils import train_process as edl_train_process
33- from edl .utils import resource_pods
28+ from edl .utils . log_utils import logger
29+ from edl .utils import pod as edl_pod
3430
35- from edl .utils import leader_pod
36- from ..utils .log_utils import logger
37- from ..utils import pod
38- from ..utils import pod_server
39- from ..utils import cluster_watcher
4031
41-
42- def edl_barrier (job_env , pod , timeout ):
43- start = time .time ()
44-
45- log_time = time .time ()
46- while True :
47- try :
48- etcd = etcd_db .get_global_etcd ()
49- leader = leader_pod .load_from_etcd (etcd )
50- if leader is None :
51- raise exceptions .EdlNotFoundLeader ("can't get leader" )
52-
53- logger .debug ("barrier on leader:{}" .format (leader ))
54-
55- c = pod_server_client .Client (leader .endpoint )
56- cluster = c .barrier (job_env .job_id , pod .get_id ())
57- return cluster
58- except Exception as e :
59- if time .time () - log_time > 30 :
60- logger .info ("wait to barrier now!" )
61- log_time = time .time ()
62- logger .debug ("barrier error:{} {}" .format (e ,
63- traceback .format_exc ()))
64-
65- if time .time () - start > timeout :
66- message = "wait to barrier with all error:{} leader:[{}] current pod:[{}]" .format (
67- traceback .format_exc (), leader , pod )
68- raise exceptions .EdlBarrierError (message )
69-
70- time .sleep (3 )
71-
72-
73- def prepare (args ):
32+ def main ():
33+ log_utils .get_logger (log_level = 10 )
34+ args = args_utils .parse_args ()
7435 args_dict = args_utils .convert_args_to_dict (args )
7536
7637 # job enviroment.
@@ -80,7 +41,7 @@ def prepare(args):
8041 # get global etcd and lock
8142 etcd = etcd_db .get_global_etcd (job_env .etcd_endpoints , job_env .job_id )
8243
83- last_status = edl_status .load_job_status_from_etcd (etcd )
44+ last_status = edl_status .load_job_status_from_etcd (etcd , timeout = 30 )
8445 if last_status == edl_status .Status .SUCCEED :
8546 logger .info ("job:{} has completed! Need't try!" .format (job_env .job_id ))
8647 sys .exit (0 )
@@ -89,146 +50,10 @@ def prepare(args):
8950 pod = edl_pod .Pod ()
9051 pod .from_env (job_env )
9152
92- # update pod status
93- edl_status .save_pod_status_to_etcd (etcd ,
94- pod .get_id (), edl_status .Status .INITIAL )
95-
96- # launch pod server
97- pod_server = PodServer (job_env , pod .get_id ())
98- pod_server .start (job_env , pod )
99- logger .info ("pod server started:[{}]" .format (pod ))
100-
101- return job_env , pod , pod_server
102-
103-
104- def job_exit (cluster ,
105- leader_register ,
106- resource_register ,
107- watcher ,
108- pod ,
109- trainer_flag ,
110- register_flag ,
111- barrier_flag ,
112- resource_flag ,
113- timeout = 300 ):
114- local_flag = trainer_flag & register_flag & barrier_flag
115- etcd = etcd_db .get_global_etcd ()
116- edl_status .save_pod_flag_to_ecd (etcd , pod .get_id (), local_flag )
117-
118- begin = time .time ()
119- while True :
120- try :
121- if leader_register .is_leader ():
122- if etcd .wait_resource (cluster , timeout = 15 ):
123- job_flag = trainer_flag & register_flag & barrier_flag & resource_flag
124- edl_status .save_job_flag_to_etcd (etcd , job_flag )
125- logger .info ("set job status:{} ok!" .format (job_flag ))
126- break
127- raise exceptions .EdlWaitFollowersReleaseError (
128- "can't wait resource" )
129- else :
130- break
131- except Exception as e :
132- logger .warning ("prepare job_exit meets error:{}" .format (e ))
133- if time .time () - begin >= timeout :
134- logger .warning ("wait resource error" )
135- break
136-
137- time .sleep (3 )
138- continue
139-
140- leader_register .stop ()
141- watcher .stop ()
142- resource_register .stop ()
143-
144-
145- def launch (args ):
146- job_env , pod , pod_server = prepare (args )
147-
148- # register pod resource to tell others:
149- # this resource can use to train
150- resource_register = resource_pods .Register (job_env , pod )
151-
152- # seize the leader
153- leader_register = leader_pod .Register (job_env , pod .get_id ())
154-
155- # register rank and watch the rank
156- # if the rank changed, the pods should restart the training proc.
157- # pod exit if barrier error
158- cluster = edl_barrier (job_env , pod , timeout = 600 )
159-
160- # update pod status
161- etcd = etcd_db .get_global_etcd ()
162- edl_status .save_pod_status_to_etcd (etcd ,
163- pod .get_id (), edl_status .Status .RUNNING )
164-
165- # watcher after barrier
166- watcher = cluster_watcher .Watcher (job_env , cluster , pod )
167-
168- procs = edl_train_process .start (
169- cluster ,
170- pod ,
171- args .training_script ,
172- args .training_script_args ,
173- log_dir = args .log_dir )
174-
175- trainer_flag = True
176- register_flag = True
177- barrier_flag = True
178- while True :
179- # check local status first
180- alive , trainer_flag = edl_train_process .watch (procs , pod .trainers_num )
181- if not alive or not trainer_flag :
182- break
183-
184- if resource_register .is_stopped () or leader_register .is_stopped ():
185- edl_train_process .terminate ()
186- register_flag = False
187- break
188-
189- # check job status second
190- if watcher .changed :
191- new_cluster = edl_barrier (job_env , pod , timeout = 60 )
192- if not new_cluster :
193- barrier_flag = False
194- break
195-
196- edl_train_process .terminate (procs )
197-
198- cluster = new_cluster
199- watcher = cluster_watcher .Watcher (job_env , cluster , pod )
200-
201- procs = edl_train_process .start (
202- job_env ,
203- cluster ,
204- pod ,
205- args .training_script ,
206- args .training_script_args ,
207- log_dir = args .log_dir )
208-
209- time .sleep (3 )
210-
211- if not register_flag :
212- logger .fatal ("register meets error and local exit!" )
213-
214- if not leader_register .is_leader ():
215- leader_register .stop ()
216-
217- job_exit (
218- cluster = cluster ,
219- leader_register = leader_register ,
220- resource_register = resource_register ,
221- watcher = watcher ,
222- pod = pod ,
223- trainer_flag = trainer_flag ,
224- register_flag = register_flag ,
225- barrier_flag = barrier_flag )
226-
227-
228- def main ():
229- log_utils .get_logger (log_level = 10 )
230- args = args_utils .parse_args ()
231- launch (args )
53+ launcher = edl_launcher .Launcher (
54+ job_env = job_env , pod = pod , etcd = etcd , args = args )
55+ launcher .init ()
56+ launcher .launch ()
23257
23358
23459if __name__ == '__main__' :
0 commit comments