Skip to content

Commit 90b95be

Browse files
authored
[launch] make log output more stable; default to stdout (#41314)
1 parent 0f6412c commit 90b95be

File tree

4 files changed

+19
-14
lines changed

4 files changed

+19
-14
lines changed

python/paddle/distributed/launch/context/node.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def get_ports_occupied(self):
4444
return self.free_ports
4545

4646
def get_free_port(self):
47+
# for loop to avoid port conflict
4748
for _ in range(100):
4849
with closing(socket.socket(socket.AF_INET,
4950
socket.SOCK_STREAM)) as s:

python/paddle/distributed/launch/controllers/controller.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,9 @@ def watch(self) -> bool:
7575
while not self.ctx.status.is_done():
7676
status = self.pod.watch(timeout=2)
7777

78-
if self.ctx.continous_log():
79-
self.pod.logs()
78+
#if self.ctx.continous_log():
79+
# default to print log
80+
self.pod.logs()
8081

8182
# completed
8283
if status == self.ctx.status.COMPLETED:

python/paddle/distributed/launch/job/container.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -145,31 +145,34 @@ def __str__(self):
145145
self.errfile,
146146
self._env, )
147147

148-
def logs(self, fn=None, offset=0, whence=1, lines=1000):
148+
def logs(self, fn=None, offset=0, whence=1, limit=1000):
149149
if not self._log_handler:
150150
self._log_handler = open(self._out)
151151

152152
if fn is None:
153153
fn = sys.stdout
154154

155-
self._log_handler.seek(offset, whence)
156-
157155
try:
158-
idx = 0
159-
for line in self._log_handler:
160-
fn.write(line)
161-
idx += 1
162-
if idx > lines:
156+
if offset != 0 or whence != 1:
157+
self._log_handler.seek(offset, whence)
158+
159+
for _ in range(limit):
160+
line = self._log_handler.readline()
161+
if not line:
163162
break
164-
finally:
163+
fn.write(line)
164+
except:
165165
return
166166

167167
def tail(self, length=3000):
168168
if not self._log_handler:
169169
self._log_handler = open(self._out)
170170

171-
self._log_handler.seek(0, 2)
172-
ed = self._log_handler.tell()
171+
try:
172+
self._log_handler.seek(0, 2)
173+
ed = self._log_handler.tell()
174+
except:
175+
pass
173176

174177
if ed > length:
175178
self.logs(offset=ed - length, whence=0)

python/paddle/distributed/launch/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def launch():
4040
4141
- ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.
4242
43-
- ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. The rank 0 log will not print in the terminal by default, while you can enable it by adding --log_level=debug. Default ``--log_level=INFO``.
43+
- ``--log_level``: The log level passed to logging.setLevel; one of CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET (case-insensitive). Default ``--log_level=INFO``.
4444
4545
- ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.
4646

0 commit comments

Comments
 (0)