
Commit 5cba3ec

Merge branch 'PaddlePaddle:develop' into phi/blas
2 parents: 581e91b + 4a8b97e

File tree: 227 files changed (+4413 / -1382 lines)


cmake/external/lite.cmake

Lines changed: 5 additions & 13 deletions
@@ -84,7 +84,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   if(WITH_ARM)
     set(LITE_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target
                            publish_inference -j)
-    message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
+    message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=OFF
         -DLITE_WITH_CUDA=OFF
@@ -120,11 +120,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
         ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake
       UPDATE_COMMAND ""
       BUILD_COMMAND ${LITE_BUILD_COMMAND}
-      INSTALL_COMMAND
-        cp ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.cc
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/ && cp
-        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/
+      INSTALL_COMMAND ""
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
@@ -141,6 +137,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   else()
     set(LITE_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target
                            publish_inference -j)
+    message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
     set(LITE_OPTIONAL_ARGS
         -DWITH_MKL=ON
         -DLITE_WITH_CUDA=OFF
@@ -173,11 +170,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
        "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?"
        ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py
       BUILD_COMMAND ${LITE_BUILD_COMMAND}
-      INSTALL_COMMAND
-        cp ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.cc
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/ && cp
-        ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/core/
+      INSTALL_COMMAND ""
       CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                  -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
@@ -196,8 +189,7 @@ endif()

 message(STATUS "Paddle-lite BINARY_DIR: ${LITE_BINARY_DIR}")
 message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
-include_directories(${LITE_SOURCE_DIR})
-include_directories(${LITE_BINARY_DIR})
+include_directories(${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/include)
 if(LITE_WITH_XPU)
   include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xdnn/include/)
   include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xre/include/)

paddle/fluid/distributed/collective/process_group_custom.cc

Lines changed: 12 additions & 0 deletions
@@ -100,6 +100,18 @@ bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
 // Same as Wait
 void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }

+void ProcessGroupCustom::CustomTask::UpdateWaitChain(
+    const phi::DeviceContext& ctx) {
+  PADDLE_ENFORCE_NE(
+      std::find(places_.cbegin(), places_.cend(), ctx.GetPlace()),
+      places_.cend(),
+      phi::errors::NotFound("Cannot find the device context in this task."));
+  auto index = std::find(places_.cbegin(), places_.cend(), ctx.GetPlace()) -
+               places_.cbegin();
+  control_events_[index].Record(
+      reinterpret_cast<const phi::CustomContext&>(ctx));
+}
+
 ProcessGroupCustom::ProcessGroupCustom(
     const std::shared_ptr<phi::distributed::Store>& store,
     const std::string& device_type,

paddle/fluid/distributed/collective/process_group_custom.h

Lines changed: 4 additions & 3 deletions
@@ -46,10 +46,11 @@ class ProcessGroupCustom : public ProcessGroupWithoutStream {
       CommType CommType,
       const std::vector<phi::DenseTensor>& inputs);

-  bool IsCompleted();
+  bool IsCompleted() override;
   void SynchronizeStreams();
-  bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
-  void Synchronize();
+  bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) override;
+  void Synchronize() override;
+  void UpdateWaitChain(const phi::DeviceContext& ctx) override;
   void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
   virtual ~CustomTask();

paddle/fluid/eager/amp_utils.h

Lines changed: 32 additions & 80 deletions
@@ -122,90 +122,42 @@ inline paddle::experimental::DataType GetAmpDestDtype(
     const std::string& op_name,
     const paddle::small_vector<std::vector<paddle::Tensor>,
                                kSlotSmallVectorSize>& amp_tensors_vector) {
-  auto amp_dtype =
-      egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype();
   auto amp_level = egr::Controller::Instance().GetAMPLevel();
-  VLOG(6) << "AMP GetAmpDestDtype:"
-          << " op(" << op_name << ") amp_dtype(" << amp_dtype << ") amp_level("
-          << static_cast<int>(amp_level) << ").";
-  auto return_amp_type = paddle::experimental::DataType::FLOAT16;
-
-  if (amp_dtype == "float16") {
-    if (amp_level == paddle::imperative::AmpLevel::O1) {
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableAllowOps()
-              ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT16;
-      } else if (paddle::imperative::AmpOperators::Instance()
-                     .GetMutableBlockOps()
-                     ->count(op_name) ||
-                 paddle::imperative::AmpOperators::Instance()
-                     .GetMutableUnsupportedFp16Ops()
-                     ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT32;
-      } else {
-        auto dst_type = GetPromoteType(op_name,
-                                       amp_tensors_vector,
-                                       paddle::experimental::DataType::FLOAT16);
-        if (dst_type == paddle::experimental::DataType::FLOAT16 &&
-            paddle::imperative::AmpOperators::Instance()
-                .GetMutableUnsupportedFp16Ops()
-                ->count(op_name)) {
-          dst_type = paddle::experimental::DataType::FLOAT32;
-        }
-        return_amp_type = dst_type;
-      }
-    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
-      auto dst_type = paddle::experimental::DataType::FLOAT16;
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableUnsupportedFp16Ops()
-              ->count(op_name) ||
-          paddle::imperative::AmpOperators::Instance()
-              .GetMutableBlockOps()
-              ->count(op_name)) {
-        dst_type = paddle::experimental::DataType::FLOAT32;
-      }
-      return_amp_type = dst_type;
+  auto amp_setting_dtype =
+      egr::Controller::Instance().GetCurrentTracer()->GetAmpPhiDtype();
+  auto dst_type = amp_setting_dtype;
+  if (amp_level == paddle::imperative::AmpLevel::O1) {
+    if (paddle::imperative::AmpOperators::Instance()
+            .GetMutableAllowOps()
+            ->count(op_name)) {
+      dst_type = amp_setting_dtype;
+    } else if (paddle::imperative::AmpOperators::Instance()
+                   .GetMutableBlockOps()
+                   ->count(op_name)) {
+      dst_type = paddle::experimental::DataType::FLOAT32;
+    } else {
+      dst_type = GetPromoteType(op_name, amp_tensors_vector, amp_setting_dtype);
     }
-  } else if (amp_dtype == "bfloat16") {
-    if (amp_level == paddle::imperative::AmpLevel::O1) {
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableAllowOps()
-              ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::BFLOAT16;
-      } else if (paddle::imperative::AmpOperators::Instance()
-                     .GetMutableBlockOps()
-                     ->count(op_name)) {
-        return_amp_type = paddle::experimental::DataType::FLOAT32;
-      } else {
-        auto dst_type =
-            GetPromoteType(op_name,
-                           amp_tensors_vector,
-                           paddle::experimental::DataType::BFLOAT16);
-        if (dst_type == paddle::experimental::DataType::BFLOAT16 &&
-            paddle::imperative::AmpOperators::Instance()
-                .GetMutableUnsupportedBf16Ops()
-                ->count(op_name)) {
-          dst_type = paddle::experimental::DataType::FLOAT32;
-        }
-        return_amp_type = dst_type;
-      }
-    } else if (amp_level == paddle::imperative::AmpLevel::O2) {
-      auto dst_type = paddle::experimental::DataType::BFLOAT16;
-      if (paddle::imperative::AmpOperators::Instance()
-              .GetMutableUnsupportedBf16Ops()
-              ->count(op_name) ||
-          paddle::imperative::AmpOperators::Instance()
-              .GetMutableBlockOps()
-              ->count(op_name)) {
-        dst_type = paddle::experimental::DataType::FLOAT32;
-      }
-      return_amp_type = dst_type;
+  } else if (amp_level == paddle::imperative::AmpLevel::O2) {
+    if (paddle::imperative::AmpOperators::Instance()
+            .GetMutableBlockOps()
+            ->count(op_name)) {
+      dst_type = paddle::experimental::DataType::FLOAT32;
     }
-  } else {
-    return_amp_type = paddle::experimental::DataType::FLOAT32;
   }
-  return GetDtypeWithPlace(op_name, amp_tensors_vector, return_amp_type);
+
+  if (dst_type == amp_setting_dtype &&
+      (paddle::imperative::AmpOperators::Instance()
+           .GetMutableUnsupportedOps(amp_setting_dtype)
+           ->count(op_name))) {
+    dst_type = paddle::experimental::DataType::FLOAT32;
+  }
+
+  dst_type = GetDtypeWithPlace(op_name, amp_tensors_vector, dst_type);
+  VLOG(6) << "AMP GetAmpDestDtype:"
+          << " op(" << op_name << ") amp_dtype(" << dst_type << ") amp_level("
+          << static_cast<int>(amp_level) << ").";
+  return dst_type;
 }

 }  // namespace egr

paddle/fluid/eager/pylayer/py_layer_node.cc

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ GradNodePyLayer::operator()(
           this->OutputMeta()[i][0].IsStopGradient(),
           true,
           paddle::platform::errors::InvalidArgument(
-              "%s's backward function should not return empyt at %d position.",
+              "%s's backward function should not return empty at %d position.",
               name(),
               i));
       grad_out.push_back({});

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ void AllReduceOpHandle::AllReduceImpl(

     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
-      // only enforce place0, we will enforce other palce numel == place0 numel
+      // only enforce place0, we will enforce other place numel == place0 numel
       PADDLE_ENFORCE_GT(
           numel,
           0,

paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
   while (cur_count < op_deps->size()) {
     cur_count++;
     auto cur_op = ready_ops->Pop();
-    // when execption, get cur_op == nullptr
+    // when exception, get cur_op == nullptr
     if (cur_op == nullptr) {
       std::lock_guard<std::mutex> lock(mutex_);
       exec_op_count_ = op_deps->size();

paddle/fluid/framework/details/eager_deletion_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 #endif
     PADDLE_ENFORCE_NOT_NULL(
         event_,
-        platform::errors::InvalidArgument("The cuda envet created is NULL."));
+        platform::errors::InvalidArgument("The cuda event created is NULL."));
   }
 }
 #endif

paddle/fluid/framework/details/fetch_async_op_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor(
   }

   // slice and memcpy
-  // for 0D tensor, can't concat eath tensor, stack them. for 1+D tensor, concat
+  // for 0D tensor, can't concat each tensor, stack them. for 1+D tensor, concat
   // them
   int begin = 0;
   int end = 0;

paddle/fluid/framework/details/multi_devices_helper.h

Lines changed: 2 additions & 2 deletions
@@ -44,8 +44,8 @@ namespace details {
 // all variable in each devices.
 // The outside vector is the device vector. Each element of this vector is a
 // map from variable name to variables. The variables, who have the same name,
-// will have a differsent version. The offset in the
-// `std::vector<VarHandle*>` is the version of varaibles.
+// will have a different version. The offset in the
+// `std::vector<VarHandle*>` is the version of variables.
 typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 constexpr char kGraphVars[] = "vars";
