Skip to content

Commit a821c4a

Browse files
authored
[PTEN] Add Gpu context (#39305)
1 parent dcff7fa commit a821c4a

File tree

135 files changed

+4438
-770
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

135 files changed

+4438
-770
lines changed

paddle/fluid/distributed/common/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace distributed {
3333
template <typename T>
3434
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
3535
GetBlas() {
36-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
36+
paddle::platform::CPUDeviceContext cpu_ctx;
3737
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
3838
T>(cpu_ctx);
3939
}

paddle/fluid/distributed/ps/service/communicator/communicator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
11551155
auto &t_latest = var_latest->Get<framework::LoDTensor>();
11561156
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
11571157

1158-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1158+
paddle::platform::CPUDeviceContext cpu_ctx;
11591159
auto *var_delta = delta_scope_->Var(varname);
11601160
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
11611161
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
11851185
RpcRecvDense(varnames, table_id, pserver_scope_.get());
11861186

11871187
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
1188-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1188+
paddle::platform::CPUDeviceContext cpu_ctx;
11891189
for (auto &varname : varnames) {
11901190
auto *var_latest = recv_scope_->FindVar(varname);
11911191
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
12921292
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
12931293

12941294
auto dims1 = t_latest.dims()[1];
1295-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1295+
paddle::platform::CPUDeviceContext cpu_ctx;
12961296

12971297
auto *var_delta = delta_scope_->Var(varname);
12981298
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
13701370
std::vector<float> v_delta;
13711371
v_delta.resize(numel);
13721372

1373-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1373+
paddle::platform::CPUDeviceContext cpu_ctx;
13741374
auto blas =
13751375
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
13761376
cpu_ctx);

paddle/fluid/distributed/ps/service/communicator/communicator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
179179
}
180180

181181
// set output tensor to 0.
182-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
182+
paddle::platform::CPUDeviceContext cpu_ctx;
183183
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
184184
constant_functor;
185185
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
204204
for (auto &var : vars) {
205205
inputs.push_back(&var->Get<pten::SelectedRows>());
206206
}
207-
auto dev_ctx = paddle::platform::CPUDeviceContext();
207+
paddle::platform::CPUDeviceContext dev_ctx;
208208
if (merge_add) {
209209
paddle::operators::math::scatter::MergeAdd<
210210
paddle::platform::CPUDeviceContext, T>

paddle/fluid/framework/data_type_transform_test.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
2121
auto cpu_place = paddle::platform::CPUPlace();
2222
auto gpu_place = paddle::platform::CUDAPlace(0);
2323
paddle::platform::CUDADeviceContext context(gpu_place);
24-
24+
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
25+
.GetAllocator(gpu_place, context.stream())
26+
.get());
27+
context.PartialInitWithAllocator();
2528
auto kernel_fp16 = paddle::framework::OpKernelType(
2629
paddle::framework::proto::VarType::FP16, gpu_place,
2730
paddle::framework::DataLayout::kAnyLayout,

paddle/fluid/framework/parallel_executor.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
13611361
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
13621362
pool.Get(member_->places_[dev_id]));
13631363
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
1364-
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
1364+
dev_ctx->SetBkclContext(bkcl_ctx.comm());
13651365
}
13661366
#else
13671367
PADDLE_THROW(

paddle/fluid/framework/pten_utils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
7777
using TYPE = pten::CPUContext;
7878
};
7979

80+
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
81+
template <>
82+
struct ConvertToPtenContext<platform::CUDADeviceContext> {
83+
using TYPE = pten::GPUContext;
84+
};
85+
#endif
86+
8087
#ifdef PADDLE_WITH_XPU
8188
template <>
8289
struct ConvertToPtenContext<platform::XPUDeviceContext> {

paddle/fluid/framework/tensor_util.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
10851085
is.seekg(seekg, is.cur);
10861086

10871087
void* buf;
1088-
auto ctx = platform::CPUDeviceContext();
1088+
platform::CPUDeviceContext ctx;
10891089
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
10901090
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
10911091
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
11551155
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
11561156
tensor->Resize(framework::make_ddim(dims));
11571157
void* buf;
1158-
auto ctx = platform::CPUDeviceContext();
1158+
platform::CPUDeviceContext ctx;
11591159
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
11601160
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
11611161
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
14321432
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
14331433
return os;
14341434
}
1435-
}
1435+
} // namespace pten

paddle/fluid/framework/tensor_util_test.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
7373
// CPU Tensor to GPU Tensor
7474
auto gpu_place = new platform::CUDAPlace(0);
7575
platform::CUDADeviceContext gpu_ctx(*gpu_place);
76+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
77+
.GetAllocator(*gpu_place, gpu_ctx.stream())
78+
.get());
79+
gpu_ctx.PartialInitWithAllocator();
7680
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
7781

7882
// GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
166170
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
167171
auto gpu_place = new paddle::platform::CUDAPlace();
168172
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
173+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
174+
.GetAllocator(*gpu_place, gpu_ctx.stream())
175+
.get());
176+
gpu_ctx.PartialInitWithAllocator();
169177
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
170178
// Copy from GPU to CPU tensor for comparison
171179
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
230238
paddle::framework::Tensor gpu_tensor;
231239
paddle::platform::CUDAPlace place;
232240
paddle::platform::CUDADeviceContext gpu_ctx(place);
241+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
242+
.GetAllocator(place, gpu_ctx.stream())
243+
.get());
244+
gpu_ctx.PartialInitWithAllocator();
233245
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
234246

235247
std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
267279
paddle::framework::Tensor gpu_tensor;
268280
paddle::platform::CUDAPlace place;
269281
paddle::platform::CUDADeviceContext gpu_ctx(place);
282+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
283+
.GetAllocator(place, gpu_ctx.stream())
284+
.get());
285+
gpu_ctx.PartialInitWithAllocator();
270286
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
271287

272288
std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
493509

494510
auto gpu_place = new platform::CUDAPlace();
495511
platform::CUDADeviceContext gpu_ctx(*gpu_place);
512+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
513+
.GetAllocator(*gpu_place, gpu_ctx.stream())
514+
.get());
515+
gpu_ctx.PartialInitWithAllocator();
496516

497517
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
498518

paddle/fluid/imperative/gloo_context.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
4646
gloo_wrapper->Init();
4747
device_ = std::unique_ptr<platform::CPUDeviceContext>(
4848
new platform::CPUDeviceContext(platform::CPUPlace()));
49+
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
50+
.GetAllocator(platform::CPUPlace())
51+
.get());
52+
device_->SetHostAllocator(
53+
paddle::memory::allocation::AllocatorFacade::Instance()
54+
.GetAllocator(paddle::platform::CPUPlace())
55+
.get());
56+
device_->SetZeroAllocator(
57+
paddle::memory::allocation::AllocatorFacade::Instance()
58+
.GetZeroAllocator(platform::CPUPlace())
59+
.get());
4960
}
5061

5162
void GLOOParallelContext::InitWithRingID(int ring_id) {

paddle/fluid/inference/lite/test_engine_lite.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
7777
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
7878
platform::CUDAPlace place;
7979
platform::CUDADeviceContext ctx(place);
80+
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
81+
.GetAllocator(place, ctx.stream())
82+
.get());
83+
ctx.PartialInitWithAllocator();
8084
#else
8185
platform::CPUPlace place;
8286
platform::CPUDeviceContext ctx(place);

0 commit comments

Comments (0)