File tree Expand file tree Collapse file tree 2 files changed +8
-0
lines changed
paddle/fluid/distributed/fleet_executor Expand file tree Collapse file tree 2 files changed +8
-0
lines changed Original file line number Diff line number Diff line change @@ -75,6 +75,11 @@ class Carrier final {
7575
7676 bool IsInit () const ;
7777
78+ // NOTE: This mutex will be used in interceptor's RunOps function.
79+ // This mutex is used for avoiding forward ops and backward ops run
80+ // simultaneously, which will lead to a random hang for some sync ops.
81+ std::mutex run;
82+
7883 DISABLE_COPY_AND_ASSIGN (Carrier);
7984
8085 private:
Original file line number Diff line number Diff line change 1313// limitations under the License.
1414
1515#include " paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
16+ #include " paddle/fluid/distributed/fleet_executor/carrier.h"
1617
1718#include " paddle/fluid/distributed/fleet_executor/task_node.h"
1819#include " paddle/fluid/framework/executor_gc_helper.h"
@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
169170}
170171
171172void ComputeInterceptor::RunOps () {
173+ Carrier& carrier_instance = Carrier::Instance ();
174+ std::unique_lock<std::mutex> lock (carrier_instance.run );
172175 VLOG (3 ) << " ComputeInterceptor " << interceptor_id_ << " running ops for the "
173176 << step_ + 1 << " time." ;
174177 for (auto op : node_->ops ()) {
You can’t perform that action at this time.
0 commit comments