
Commit 22908aa

deal with conflict
Merge commit 22908aa (2 parents: 5e2f2a7 + 870402f)

101 files changed: +5329 additions, −2223 deletions

paddle/fluid/eager/auto_code_generator/eager_generator.cc

Lines changed: 2 additions & 1 deletion
@@ -2653,7 +2653,8 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
       "#include \"paddle/fluid/eager/amp_utils.h\"\n"
       "#include \"paddle/fluid/eager/amp_auto_cast.h\"\n"
-      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n"
+      "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
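
The fix is a one-literal change because the generator assembles the whole preamble from adjacent string literals, which the compiler joins into a single constant before streaming it to the output file. A minimal sketch of that mechanism, with a hypothetical header path rather than the generator's real template:

#include <fstream>
#include <string>

void WriteGeneratedPreamble(const std::string& forward_cc_path) {
  // Adjacent string literals concatenate at compile time, so adding one
  // more generated line is just one more literal in the chain.
  const char* preamble =
      "#include \"some/generated/header.h\"\n"
      "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n\n";
  std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
  forward_cc_stream << preamble;
}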

paddle/fluid/framework/new_executor/data_transfer.cc

Lines changed: 12 additions & 5 deletions
@@ -319,6 +319,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
     }
   }

+  bool transfered = false;
   DataTranferHelper data_transfer_helper(place, var_scope);
   for (auto& var_name_item : *ins_map_temp) {
     bool should_skip_input =
@@ -334,6 +335,9 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
       if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
         tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       } else if (var->IsType<LoDTensorArray>()) {
+        if (var->Get<LoDTensorArray>().size() == 0) {
+          continue;
+        }
         tensor_in =
             static_cast<const Tensor*>(&(var->Get<LoDTensorArray>()[0]));
       } else {
@@ -389,6 +393,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
       }

       if (is_transferred) {
+        transfered = true;
         // update RuntimeContext.inputs and original op_func_node inputs
         op_func_node->input_index[var_name_item.first][i] =
             var_scope->VarId(new_var_name);
@@ -426,11 +431,13 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
     }
   }

-  // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
-  // with instruction. (hot fix, it is not good design here)
-  op_func_node->operator_base_ =
-      std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
-          op_base->Type(), new_ins, new_outs, op_base->Attrs()));
+  if (transfered) {
+    // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it
+    // consistent with instruction. (hot fix, it is not good design here)
+    op_func_node->operator_base_ =
+        std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
+            op_base->Type(), new_ins, new_outs, op_base->Attrs()));
+  }
   op_func_node->no_data_transform_index = std::move(no_data_transform_index);
 }
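
Two defensive patterns are at work in this diff: an empty LoDTensorArray is skipped before the code takes the address of its first element (which would be undefined behavior), and the OperatorBase is rebuilt only when at least one input was actually transferred. A distilled sketch of that control flow, using simplified stand-in types rather than Paddle's:

#include <vector>

struct Tensor {};                         // stand-in for Paddle's Tensor
using TensorArray = std::vector<Tensor>;  // stand-in for LoDTensorArray

// Returns true if any input needed (and received) a transform, so the
// caller can skip the expensive operator rebuild when nothing changed.
bool TransformInputs(std::vector<TensorArray*>& inputs) {
  bool transferred = false;
  for (TensorArray* arr : inputs) {
    if (arr->empty()) {
      continue;  // guard: &(*arr)[0] on an empty vector is undefined behavior
    }
    const Tensor* tensor_in = &(*arr)[0];
    (void)tensor_in;
    // ... suppose the layout/dtype/place check decided a transfer was needed:
    transferred = true;
  }
  return transferred;  // rebuild the operator only if this is true
}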

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 11 additions & 4 deletions
@@ -300,8 +300,16 @@ void InterpreterCore::Convert(
     gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(),
                            platform::GenerateDeviceEventFlag());
   }
+  bool inplaced = false;
+  for (auto inst : vec_instruction_) {
+    if (inst.OpBase()->Type() == "share_buffer" ||
+        inst.OpBase()->Type() == "share_data") {
+      VLOG(4) << "Already inplaced, skip inplace now.";
+      inplaced = true;
+    }
+  }

-  if (FLAGS_new_executor_use_inplace) {
+  if (FLAGS_new_executor_use_inplace && !inplaced) {
     BuildInplace();
   }

@@ -565,12 +573,11 @@ void InterpreterCore::RunNextInstructions(
     const Instruction& instr, std::queue<size_t>* reserved_next_ops,
     std::vector<std::atomic<size_t>>* atomic_deps,
     std::vector<std::atomic<size_t>>* atomic_var_ref) {
-  VLOG(4) << "atomic 1:" << atomic_deps;
   auto& next_instr = instr.NextInstructions();

   auto IsReady = [atomic_deps](size_t next_id) {
-    VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
-            << " " << next_id;
+    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
+            << ", remain deps: " << (*atomic_deps)[next_id];
     return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
   };
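
The IsReady lambda implements a standard atomic dependency countdown: each instruction carries a counter of unfinished upstream ops, every finished predecessor decrements it, and the caller that performs the final decrement (fetch_sub returning 1) is the one that schedules the instruction, so no op is enqueued twice. A self-contained sketch of the scheme; the names are illustrative, not Paddle's API:

#include <atomic>
#include <cstddef>
#include <queue>
#include <vector>

// deps[i] holds the number of unfinished upstream ops of instruction i.
// fetch_sub returns the value *before* the decrement, so exactly one
// caller observes 1 and wins the right to schedule the instruction.
void OnInstructionFinished(const std::vector<size_t>& downstream,
                           std::vector<std::atomic<size_t>>* atomic_deps,
                           std::queue<size_t>* ready_ops) {
  for (size_t next_id : downstream) {
    if ((*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1) {
      ready_ops->push(next_id);
    }
  }
}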

paddle/fluid/framework/new_executor/interpretercore_util.cc

Lines changed: 146 additions & 5 deletions
@@ -428,19 +428,19 @@ void build_op_func_list(const platform::Place& place,
       op_func_node.dev_ctx_ = dev_ctx;
       VLOG(3) << op_with_kernel->Type()
               << " : expected_kernel_key : " << expected_kernel_key;
-      auto exec_ctx =
-          ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);

       // see OperatorWithKernel::RunImpl in operator.cc for why
       if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
             op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
         InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
         // TODO(Aurelius84): In case of control flow ops, they are NOT
-        // inherited
-        // from OperatorWithKernel.
+        // inherited from OperatorWithKernel.
         op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
       }

+      auto exec_ctx =
+          ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
+
       auto run_phi_kernel = false;
       if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
               op_with_kernel->Type())) {
@@ -476,7 +476,6 @@ void build_op_func_list(const platform::Place& place,
           op_with_kernel->BuildPhiKernelContext(runtime_context, dev_ctx,
                                                 &pt_kernel_context);
           op_func_node.pt_kernel_ = op_with_kernel->PhiKernel();
-
           (*op_func_node.pt_kernel_)(&pt_kernel_context);
         } else {
           auto kernels_iter = all_op_kernels.find(op->Type());
@@ -711,6 +710,7 @@ std::map<int, std::list<int>> build_op_downstream_map(
   const std::set<std::string> random_op_set = {
       "bernoulli", "poisson", "multinomial", "gaussian_random",
       "uniform_random", "randint", "randperm", "exponential"};
+
   int dependence_op_idx = -1;
   for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
     if (random_op_set.count(vec_instruction[op_idx].OpBase()->Type())) {
@@ -721,6 +721,147 @@ std::map<int, std::list<int>> build_op_downstream_map(
     }
   }

+  // add dependency for communication op
+  const std::string communication_op_prefix = "c_";
+  dependence_op_idx = -1;
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type().find(
+            communication_op_prefix) != std::string::npos) {
+      if (dependence_op_idx != -1) {
+        op2dependences[op_idx].insert(dependence_op_idx);
+      }
+      dependence_op_idx = op_idx;
+    }
+  }
+
+  // TODO(zhiqiu): there are still some cases not handled
+  // add dependency for c_sync_comm_stream
+
+  // in a program, we can add only one c_sync_comm_stream to sync all
+  // communication ops:
+  // c_allreduce_sum(a)
+  // c_allreduce_sum(b)
+  // c_allreduce_sum(c)
+  // c_sync_comm_stream(a)
+  const std::string kSyncComm = "c_sync_comm_stream";
+  dependence_op_idx = -1;
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type() == kSyncComm) {
+      dependence_op_idx = op_idx;
+    } else {
+      if (dependence_op_idx != -1) {
+        VLOG(4) << "Add depend from "
+                << vec_instruction[dependence_op_idx].OpBase()->Type() << " to "
+                << vec_instruction[op_idx].OpBase()->Type();
+        op2dependences[op_idx].insert(dependence_op_idx);
+      }
+    }
+  }
+
+  // add dependency for coalesce_tensor
+  const std::string kCoalesceTensor = "coalesce_tensor";
+  for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) {
+    if (vec_instruction[op_idx].OpBase()->Type() == kCoalesceTensor) {
+      VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx;
+      auto fused_out = vec_instruction[op_idx].Outputs().at("FusedOutput")[0];
+      auto outputs = vec_instruction[op_idx].Outputs().at("Output");
+
+      auto is_read = [](const Instruction& inst, int var_id) -> bool {
+        for (auto pair : inst.Inputs()) {
+          for (auto item : pair.second) {
+            if (item == var_id) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      auto is_write = [](const Instruction& inst, int var_id) -> bool {
+        for (auto pair : inst.Outputs()) {
+          for (auto item : pair.second) {
+            if (item == var_id) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      // find first op that reads fused_out
+      auto first_read_fused_out_op = -1;
+      for (auto j = op_idx + 1; j < vec_instruction.size(); ++j) {
+        if (is_read(vec_instruction[j], fused_out)) {
+          first_read_fused_out_op = j;
+          break;
+        }
+      }
+
+      if (UNLIKELY(first_read_fused_out_op == -1)) {
+        VLOG(4) << "No op reads FusedOutput";
+        continue;
+      }
+
+      // find ops that write 'outputs' between (op_idx,
+      // first_read_fused_out_op)
+      // add depend: them -> first_read_fused_out_op
+      for (auto j = op_idx + 1;
+           j < static_cast<size_t>(first_read_fused_out_op); ++j) {
+        for (auto var_id : outputs) {
+          if (is_write(vec_instruction[j], var_id)) {
+            op2dependences[first_read_fused_out_op].insert(j);
+            VLOG(4) << j << " -> " << first_read_fused_out_op;
+            VLOG(4)
+                << "Add depend from " << vec_instruction[j].OpBase()->Type()
+                << " to "
+                << vec_instruction[first_read_fused_out_op].OpBase()->Type();
+          }
+        }
+      }
+
+      // find first op that reads 'outputs' between (first_read_fused_out_op,
+      // end)
+      // add depend: first_read_fused_out_op -> first op that reads 'outputs'
+
+      // special case for consecutive communication ops, for example,
+      // FusedOutput = c_sync_calc_stream(FusedOutput)
+      // FusedOutput = c_allreduce_sum(FusedOutput)
+      // FusedOutput = c_sync_comm_stream(FusedOutput)
+      // we should take the last one to add depend instead of
+      // 'first_read_fused_out_op'
+      size_t target = first_read_fused_out_op;
+      for (size_t j = first_read_fused_out_op + 1; j < vec_instruction.size();
+           ++j) {
+        if (j == target + 1 &&
+            vec_instruction[target].OpBase()->Type().find(
                communication_op_prefix) != std::string::npos &&
+            vec_instruction[j].OpBase()->Type().find(communication_op_prefix) !=
+                std::string::npos) {
+          VLOG(4) << "Found consecutive communication ops, "
+                  << vec_instruction[target].OpBase()->Type() << " -> "
+                  << vec_instruction[j].OpBase()->Type();
+          target = j;
+          continue;
+        }
+
+        for (auto var_id : outputs) {
+          if (is_read(vec_instruction[j], var_id)) {
+            op2dependences[j].insert(target);
+            VLOG(4) << target << " -> " << j;
+            VLOG(4) << "Add depend from "
+                    << vec_instruction[target].OpBase()->Type() << " to "
+                    << vec_instruction[j].OpBase()->Type();
+          }
+        }
+      }
+    }
+  }
+  for (auto pair : op2dependences) {
+    VLOG(10) << pair.first << " Depends on " << pair.second.size();
+    std::ostringstream oss;
+    std::copy(pair.second.begin(), pair.second.end(),
+              std::ostream_iterator<int>(oss, " "));
+    VLOG(10) << oss.str();
+  }
   return std::move(get_downstream_map(op2dependences));
 }
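
The first added pass is a simple chaining rule: every op whose type carries the c_ prefix depends on the previous such op, which serializes all communication ops in program order. A stripped-down sketch of that pass over plain op-type strings (simplified types, not Paddle's Instruction):

#include <map>
#include <set>
#include <string>
#include <vector>

// For each op whose type contains `prefix`, record a dependency on the
// previous matching op, so all matching ops execute in program order.
std::map<int, std::set<int>> ChainOpsWithPrefix(
    const std::vector<std::string>& op_types, const std::string& prefix) {
  std::map<int, std::set<int>> op2dependences;
  int dependence_op_idx = -1;
  for (int op_idx = 0; op_idx < static_cast<int>(op_types.size()); ++op_idx) {
    if (op_types[op_idx].find(prefix) != std::string::npos) {
      if (dependence_op_idx != -1) {
        op2dependences[op_idx].insert(dependence_op_idx);
      }
      dependence_op_idx = op_idx;
    }
  }
  return op2dependences;
}

// e.g. ChainOpsWithPrefix({"matmul", "c_allreduce_sum", "relu",
//                          "c_allreduce_sum"}, "c_") yields {3: {1}}:
// the second allreduce waits for the first.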

paddle/fluid/imperative/basic_engine.cc

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 DECLARE_bool(sort_sum_gradient);
@@ -645,6 +646,8 @@ void BasicEngine::Execute() {
   Clear();

   VLOG(1) << "Backward op number: " << op_num;
+
+  phi::autotune::AutoTuneStatus::Instance().Update();
 }

 void BasicEngine::Clear() {
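
The new call ticks the kernel auto-tuning state machine once per backward pass. The underlying pattern is a singleton step counter that decides when to keep collecting timings and when to freeze kernel choices; a minimal sketch of that pattern (hypothetical class and window length, not phi's actual implementation):

#include <cstdint>

// One global tuning clock, advanced once per iteration (e.g. at the end
// of BasicEngine::Execute). While inside the window, kernels are timed;
// afterwards the cached winners are reused.
class AutoTuneStatusSketch {
 public:
  static AutoTuneStatusSketch& Instance() {
    static AutoTuneStatusSketch instance;  // thread-safe init since C++11
    return instance;
  }
  void Update() { ++step_; }
  bool UseAutoTune() const { return step_ < kTuneWindow; }

 private:
  static constexpr int64_t kTuneWindow = 10;  // assumed window length
  int64_t step_ = 0;
};

// usage, mirroring the diff:
//   AutoTuneStatusSketch::Instance().Update();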

paddle/fluid/memory/allocation/allocator_facade.cc

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
 // NOTE(Ruibiao): This FLAGS is just to be compatible with
 // the old single-stream CUDA allocator. It will be removed
 // after StreamSafeCudaAllocator has been fully tested.
-PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
+PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true,
                             "Enable StreamSafeCUDAAllocator");

 PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,

paddle/fluid/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -102,10 +102,11 @@ endif()

 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)

-register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
+register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})

 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
+op_library(quantize_linear_op DEPS cast_kernel)
 op_library(save_combine_op DEPS string_array)
 op_library(load_combine_op DEPS string_array)

paddle/fluid/operators/detection/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
+detection_library(nms_op SRCS nms_op.cc nms_op.cu)

 if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)
