Skip to content

Commit ce9623e

Browse files
authored
Merge branch 'develop' into cpu_context
2 parents b98addd + 8784ec6 commit ce9623e

File tree

70 files changed

+2896
-840
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+2896
-840
lines changed

paddle/fluid/distributed/fleet_executor/dist_model.cc

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include <glog/logging.h>
1616

1717
#include "paddle/fluid/distributed/fleet_executor/dist_model.h"
18+
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
19+
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
1820
#include "paddle/fluid/framework/block_desc.h"
1921
#include "paddle/fluid/framework/naive_executor.h"
2022
#include "paddle/fluid/framework/op_proto_maker.h"
@@ -68,9 +70,15 @@ bool DistModel::Init() {
6870
program_.reset(config_.program_desc);
6971
scope_.reset(config_.scope);
7072
}
73+
if (!PrepareFeedAndFetch()) {
74+
return false;
75+
}
7176
if (!CommInit()) {
7277
return false;
7378
}
79+
if (!PrepareFleetExe()) {
80+
return false;
81+
}
7482
return true;
7583
}
7684

@@ -298,6 +306,55 @@ bool DistModel::LoadParameters() {
298306
return true;
299307
}
300308

309+
bool DistModel::PrepareFleetExe() {
310+
task_node_.reset(new TaskNode(program_.get(), config_.local_rank));
311+
if (config_.local_rank - config_.mp_degree >= 0) {
312+
task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree);
313+
}
314+
if (config_.local_rank + config_.mp_degree < config_.nranks) {
315+
task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree);
316+
}
317+
task_node_->SetType("Compute");
318+
task_node_->Init();
319+
executor_desc_ = FleetExecutorDesc();
320+
executor_desc_.set_cur_rank(config_.local_rank);
321+
std::unordered_map<int64_t, int64_t> id_to_rank;
322+
for (int i = 0; i < config_.nranks; ++i) {
323+
RankInfo *rank_info = executor_desc_.add_cluster_info();
324+
rank_info->set_rank(i);
325+
rank_info->set_ip_port(config_.trainer_endpoints[i]);
326+
id_to_rank.insert({i, i});
327+
}
328+
fleet_exe.reset(new FleetExecutor(executor_desc_));
329+
fleet_exe->Init("inference", *(program_.get()), scope_.get(), place_, 1,
330+
{task_node_.get()}, id_to_rank);
331+
return true;
332+
}
333+
334+
bool DistModel::PrepareFeedAndFetch() {
335+
for (auto *op : program_->Block(0).AllOps()) {
336+
if (op->Type() == "feed") {
337+
VLOG(3) << "feed op with feed var: " << op->Output("Out")[0];
338+
int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
339+
if (feeds_.size() <= static_cast<size_t>(idx)) {
340+
feeds_.resize(idx + 1);
341+
}
342+
feeds_[idx] = op;
343+
feed_names_[op->Output("Out")[0]] = idx;
344+
idx_to_feeds_[idx] = op->Output("Out")[0];
345+
} else if (op->Type() == "fetch") {
346+
VLOG(3) << "fetch op with fetch var: " << op->Input("X")[0];
347+
int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
348+
if (fetches_.size() <= static_cast<size_t>(idx)) {
349+
fetches_.resize(idx + 1);
350+
}
351+
fetches_[idx] = op;
352+
id_to_fetches_[idx] = op->Input("X")[0];
353+
}
354+
}
355+
return true;
356+
}
357+
301358
void DistModel::Run(const std::vector<paddle::framework::Tensor> &input_data,
302359
std::vector<paddle::framework::Tensor> *output_data) {
303360
/* TODO(fleet exe dev): implement this function */

paddle/fluid/distributed/fleet_executor/dist_model.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ class BlockDesc;
3232

3333
namespace distributed {
3434

35+
class TaskNode;
36+
class FleetExecutor;
37+
3538
struct DistModelConfig {
3639
std::string model_dir{};
3740
framework::ProgramDesc* program_desc{nullptr};
@@ -66,12 +69,21 @@ class DistModel {
6669
bool LoadParameters();
6770
bool PreparePlace();
6871
bool CommInit();
72+
bool PrepareFeedAndFetch();
73+
bool PrepareFleetExe();
6974
void InsertCommOp(std::string tmp_var_name, int nranks, int rank,
7075
const std::vector<std::string>& peer_endpoints,
7176
framework::BlockDesc* block, int ring_id);
7277

78+
std::vector<framework::OpDesc*> feeds_;
79+
std::map<std::string, int64_t> feed_names_;
80+
std::map<int64_t, std::string> idx_to_feeds_;
81+
std::vector<framework::OpDesc*> fetches_;
82+
std::map<int64_t, std::string> id_to_fetches_;
7383
DistModelConfig config_;
7484
FleetExecutorDesc executor_desc_;
85+
std::shared_ptr<FleetExecutor> fleet_exe;
86+
std::shared_ptr<TaskNode> task_node_;
7587
std::shared_ptr<framework::Scope> scope_;
7688
paddle::platform::Place place_;
7789
std::shared_ptr<framework::ProgramDesc> program_;

paddle/fluid/distributed/fleet_executor/fleet_executor.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) {
3535
InitMessageBus();
3636
}
3737

38+
FleetExecutor::FleetExecutor(const FleetExecutorDesc& exe_desc)
    : exe_desc_(exe_desc) {
  // The message bus is a process-wide singleton: it is created exactly once
  // here, then this executor is wired into it.
  GlobalVal<MessageBus>::Create();
  InitMessageBus();
}
44+
3845
FleetExecutor::~FleetExecutor() {
3946
for (const auto& carrier_id : carrier_ids_) {
4047
GlobalMap<std::string, Carrier>::Get(carrier_id)->Release();

paddle/fluid/distributed/fleet_executor/fleet_executor.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class FleetExecutor final {
3636
public:
3737
FleetExecutor() = delete;
3838
explicit FleetExecutor(const std::string& exe_desc_str);
39+
explicit FleetExecutor(const FleetExecutorDesc& exe_desc);
3940
~FleetExecutor();
4041
void Init(const std::string& carrier_id,
4142
const framework::ProgramDesc& program_desc, framework::Scope* scope,

paddle/fluid/distributed/fleet_executor/task_node.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank,
3838
task_id_ = task_node_cnt++;
3939
}
4040

41+
TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank)
    : program_(program), rank_(rank), task_id_(rank) {
  // DistModel inference runs each task exactly once, so both the run-time
  // budget and the slot budget are pinned to 1.
  max_run_times_ = 1;
  max_slot_nums_ = 1;
  LOG(INFO)
      << "Constructing TaskNode for DistModelInf. The TaskNode's id is: "
      << rank
      << ". And the TaskNode's max_run_time and max_slot_num will be set to 1.";
}
50+
4151
void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) {
4252
program_ = program;
4353
}

paddle/fluid/distributed/fleet_executor/task_node.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class TaskNode final {
4242
int64_t max_slot_nums);
4343
TaskNode(paddle::framework::ProgramDesc* program, int64_t rank,
4444
int64_t max_run_times, int64_t max_slot_nums);
45+
TaskNode(paddle::framework::ProgramDesc* program, int64_t rank);
4546
~TaskNode() = default;
4647

4748
void SetProgram(paddle::framework::ProgramDesc* program);

paddle/fluid/framework/operator.cc

Lines changed: 85 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,9 +1192,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
11921192
platform::EventRole::kInnerOp);
11931193
if (run_pten_kernel_) {
11941194
pten::KernelContext pt_kernel_context;
1195+
// Do data transform before building KernelContext
1196+
PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_,
1197+
runtime_ctx);
11951198
BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
11961199
(*pt_kernel_)(&pt_kernel_context);
1197-
WriteBackToOutputs(runtime_ctx, &pt_kernel_context);
11981200
} else {
11991201
(*kernel_func_)(
12001202
ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
@@ -1786,6 +1788,62 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
17861788
pten::TransToPtenKernelName(Type()));
17871789
}
17881790

1791+
Scope* OperatorWithKernel::PreparePtenData(
    const Scope& scope, const pten::Kernel& pt_kernel,
    const KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const {
  // Checks each kernel input against the place the pten kernel expects.
  // The actual transfer is currently disabled (see the TODO below), so this
  // always returns nullptr; the scaffolding is kept for future kernels that
  // need cross-device input transform.
  auto& input_names = std::get<0>(pt_kernel_signature.args);
  auto input_defs = pt_kernel.args_def().input_defs();
  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of inputs_args names (%d) must be equal to "
                        "the size of kernel input_defs (%d).",
                        input_names.size(), input_defs.size()));
  Scope* new_scope = nullptr;
  for (size_t i = 0; i < input_defs.size(); ++i) {
    auto& in_def = input_defs.at(i);
    auto& ins_vector = ctx->inputs.at(input_names[i]);
    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
      auto* var = ins_vector[offset];
      // Only tensors can be transferred to another device.
      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
      }

      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
      // Uninitialized tensors carry no data, so there is nothing to move.
      if (!tensor_in->IsInitialized()) {
        continue;
      }

      // Already on the place the kernel expects: no transform needed.
      auto expected_place = pten::TransToFluidPlace(in_def.backend);
      if (platform::is_same_place(tensor_in->place(), expected_place)) {
        continue;
      }

      // TODO(zyfncg): Now there is no kernel which need to transform input
      // data, so we commented out following code temporarily,
      // and it will be used in the future.
      // NOTE(review): when re-enabling, `ins_vector[i] = trans_var;` below
      // looks like it should index by `offset`, not `i` — confirm.

      // VLOG(3) << "PTen Transform Variable " << input_names[i] << " from "
      //         << tensor_in->place() << " to " << expected_place;

      // if (!new_scope) {
      //   new_scope = &scope.NewScope();
      // }

      // // Create new var with the same name in transfer scopes
      // auto* trans_var = new_scope->Var(input_names[i]);
      // ins_vector[i] = trans_var;

      // // Do transfer
      // Tensor out;
      // framework::TensorCopySync(*tensor_in, expected_place, &out);
      // SetTensorToVariable(*var, out, trans_var);
    }
  }

  return new_scope;
}
1846+
17891847
void OperatorWithKernel::BuildPtenKernelContext(
17901848
const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
17911849
pten::KernelContext* pt_kernel_context) const {
@@ -1818,7 +1876,6 @@ void OperatorWithKernel::BuildPtenKernelContext(
18181876
attr_names.size(), attr_defs.size()));
18191877

18201878
for (size_t i = 0; i < input_names.size(); ++i) {
1821-
auto& in_def = input_defs.at(i);
18221879
auto& ins_vector = ctx.inputs.at(input_names[i]);
18231880

18241881
// calculate the start and end index of the input tensors
@@ -1827,24 +1884,44 @@ void OperatorWithKernel::BuildPtenKernelContext(
18271884
size_t end_idx = start_idx + ins_vector.size();
18281885

18291886
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
1830-
pt_kernel_context->EmplaceBackInputWithoutSetRange(
1831-
experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def));
1887+
const framework::Tensor* tensor_in = nullptr;
1888+
auto* var = ins_vector[offset];
1889+
if (var->IsType<framework::LoDTensor>()) {
1890+
tensor_in = &(var->Get<framework::LoDTensor>());
1891+
} else {
1892+
PADDLE_THROW(platform::errors::Unimplemented(
1893+
"Unsupported input `%s` type when call pt kernel.",
1894+
framework::ToTypeName(var->Type())));
1895+
} // TODO(zyfncg): Add support for SelectedRows
1896+
1897+
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
18321898
}
18331899
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
18341900
}
18351901

18361902
for (size_t i = 0; i < output_names.size(); ++i) {
1837-
auto& out_def = output_defs.at(i);
18381903
auto& outs_vector = ctx.outputs.at(output_names[i]);
18391904

18401905
size_t start_idx =
18411906
(i == 0 ? 0 : pt_kernel_context->OutputRangeAt(i - 1).second);
18421907
size_t end_idx = start_idx + outs_vector.size();
18431908

18441909
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
1845-
pt_kernel_context->EmplaceBackOutputWithoutSetRange(
1846-
experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
1847-
out_def));
1910+
framework::Tensor* tensor_out = nullptr;
1911+
auto* var = outs_vector[offset];
1912+
if (var->template IsType<framework::LoDTensor>()) {
1913+
tensor_out = var->template GetMutable<framework::LoDTensor>();
1914+
} else {
1915+
PADDLE_THROW(platform::errors::Unimplemented(
1916+
"Unsupported output `%s` type when call pt kernel.",
1917+
framework::ToTypeName(var->Type())));
1918+
} // TODO(zyfncg): Add support for SelectedRows
1919+
1920+
experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
1921+
SetAllocationForOutputTenosr(
1922+
tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
1923+
1924+
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
18481925
}
18491926

18501927
// Deal with the case that some outputs are NULL when run the kernel.

paddle/fluid/framework/operator.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,14 @@ class OperatorWithKernel : public OperatorBase {
588588
/* member functions for adapting to pten lib */
589589
void ChoosePtenKernel(const ExecutionContext& ctx) const;
590590

591+
/**
592+
* Transfer data place for pten kernel
593+
* Is this really needed?
594+
*/
595+
Scope* PreparePtenData(const Scope& scope, const pten::Kernel& pt_kernel,
596+
const KernelSignature& pt_kernel_signature,
597+
RuntimeContext* ctx) const;
598+
591599
void BuildPtenKernelContext(const RuntimeContext& ctx,
592600
platform::DeviceContext* dev_ctx,
593601
pten::KernelContext* pt_kernel_context) const;

paddle/fluid/framework/pten_utils.cc

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,17 +137,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() {
137137
auto& in = op_proto_->inputs()[i];
138138
auto& in_name = in.name();
139139
if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
140-
VLOG(3) << "Parse PtenKernel input: skip extra & quant input - "
140+
VLOG(6) << "Parse PtenKernel input: skip extra & quant input - "
141141
<< in_name;
142142
continue;
143143
}
144144
// If contains dispensable input, we should override the
145145
// GetExpectedPtenKernelArgs method self
146146
if (in.has_dispensable() && in.dispensable()) {
147-
VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name;
147+
VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name;
148148
continue;
149149
}
150-
VLOG(3) << "Parse PtenKernel input: " << in_name;
150+
VLOG(6) << "Parse PtenKernel input: " << in_name;
151151
input_names_.emplace_back(in_name);
152152
}
153153
return input_names_;
@@ -159,7 +159,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
159159
auto& out = op_proto_->outputs()[i];
160160
auto& out_name = out.name();
161161
// TODO(chenweihang): outputs also need skip some cases
162-
VLOG(3) << "Parse PtenKernel output: " << out_name;
162+
VLOG(6) << "Parse PtenKernel output: " << out_name;
163163
output_names_.emplace_back(out_name);
164164
}
165165
return output_names_;
@@ -173,17 +173,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
173173
if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
174174
attr_name == "op_role_var" || attr_name == "op_namescope" ||
175175
attr_name == "op_callstack" || attr_name == "op_device") {
176-
VLOG(3) << "Parse PtenKernel attribute: skip needless attr - "
176+
VLOG(6) << "Parse PtenKernel attribute: skip needless attr - "
177177
<< attr_name;
178178
continue;
179179
}
180180
if ((attr.has_extra() && attr.extra()) ||
181181
(attr.has_quant() && attr.quant())) {
182-
VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - "
182+
VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - "
183183
<< attr_name;
184184
continue;
185185
}
186-
VLOG(3) << "Parse PtenKernel attribute: " << attr_name;
186+
VLOG(6) << "Parse PtenKernel attribute: " << attr_name;
187187
attr_names_.emplace_back(attr_name);
188188
}
189189

@@ -196,5 +196,23 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
196196
GetOutputArgsNames());
197197
}
198198

199+
// Ensure `tensor` owns an allocation on `place` that is large enough for its
// current dims/dtype. NOTE(review): "Tenosr" is a typo for "Tensor"; the name
// is kept because it matches the declaration and all call sites — fix them
// together in a follow-up.
void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
                                  const platform::Place& place) {
  // Nothing to do when the tensor already holds memory on the right place.
  if (tensor->IsInitialized() && tensor->place() == place) {
    return;
  }
  // UNDEFINED dtype contributes zero bytes; negative (not-yet-inferred) dims
  // are treated as an empty tensor.
  const int dtype_size = tensor->dtype() == DataType::UNDEFINED
                             ? 0
                             : experimental::SizeOf(tensor->dtype());
  int64_t numels = product(tensor->dims());
  if (numels < 0) {
    numels = 0;
  }
  auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
  // Move ownership out of the unique_ptr into a shared_ptr that reuses the
  // allocator's deleter, then hand it to the tensor.
  auto& deleter = tmp_allocation_ptr.get_deleter();
  auto* allocation_ptr = tmp_allocation_ptr.release();
  auto shared_allocation =
      std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);

  tensor->ResetHolder(shared_allocation);
}
216+
199217
} // namespace framework
200218
} // namespace paddle

0 commit comments

Comments
 (0)