Skip to content

Commit b4e44b0

Browse files
authored
[fleet_executor] Fix overlap hang (#38024)
1 parent a9bd6f0 commit b4e44b0

File tree

2 files changed

+8
-0
lines changed

2 files changed

+8
-0
lines changed

paddle/fluid/distributed/fleet_executor/carrier.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ class Carrier final {
7575

7676
bool IsInit() const;
7777

78+
// NOTE: This mutex will be used in interceptor's RunOps function.
79+
// This mutex is used for avoiding forward ops and backward ops run
80+
// simultaneously, which will lead to a random hang for some sync ops.
81+
std::mutex run;
82+
7883
DISABLE_COPY_AND_ASSIGN(Carrier);
7984

8085
private:

paddle/fluid/distributed/fleet_executor/compute_interceptor.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
16+
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
1617

1718
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
1819
#include "paddle/fluid/framework/executor_gc_helper.h"
@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
169170
}
170171

171172
void ComputeInterceptor::RunOps() {
173+
Carrier& carrier_instance = Carrier::Instance();
174+
std::unique_lock<std::mutex> lock(carrier_instance.run);
172175
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
173176
<< step_ + 1 << " time.";
174177
for (auto op : node_->ops()) {

0 commit comments

Comments
 (0)