Skip to content

Commit a821c4a

Browse files
authored
[PTEN] Add Gpu context (#39305)
1 parent dcff7fa commit a821c4a

File tree

135 files changed

+4438
-770
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

135 files changed

+4438
-770
lines changed

paddle/fluid/distributed/common/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace distributed {
3333
template <typename T>
3434
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
3535
GetBlas() {
36-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
36+
paddle::platform::CPUDeviceContext cpu_ctx;
3737
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
3838
T>(cpu_ctx);
3939
}

paddle/fluid/distributed/ps/service/communicator/communicator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
11551155
auto &t_latest = var_latest->Get<framework::LoDTensor>();
11561156
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
11571157

1158-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1158+
paddle::platform::CPUDeviceContext cpu_ctx;
11591159
auto *var_delta = delta_scope_->Var(varname);
11601160
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
11611161
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
11851185
RpcRecvDense(varnames, table_id, pserver_scope_.get());
11861186

11871187
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
1188-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1188+
paddle::platform::CPUDeviceContext cpu_ctx;
11891189
for (auto &varname : varnames) {
11901190
auto *var_latest = recv_scope_->FindVar(varname);
11911191
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
12921292
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
12931293

12941294
auto dims1 = t_latest.dims()[1];
1295-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1295+
paddle::platform::CPUDeviceContext cpu_ctx;
12961296

12971297
auto *var_delta = delta_scope_->Var(varname);
12981298
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
13701370
std::vector<float> v_delta;
13711371
v_delta.resize(numel);
13721372

1373-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
1373+
paddle::platform::CPUDeviceContext cpu_ctx;
13741374
auto blas =
13751375
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
13761376
cpu_ctx);

paddle/fluid/distributed/ps/service/communicator/communicator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
179179
}
180180

181181
// set output tensor to 0.
182-
auto cpu_ctx = paddle::platform::CPUDeviceContext();
182+
paddle::platform::CPUDeviceContext cpu_ctx;
183183
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
184184
constant_functor;
185185
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
204204
for (auto &var : vars) {
205205
inputs.push_back(&var->Get<pten::SelectedRows>());
206206
}
207-
auto dev_ctx = paddle::platform::CPUDeviceContext();
207+
paddle::platform::CPUDeviceContext dev_ctx;
208208
if (merge_add) {
209209
paddle::operators::math::scatter::MergeAdd<
210210
paddle::platform::CPUDeviceContext, T>

paddle/fluid/framework/data_type_transform_test.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
2121
auto cpu_place = paddle::platform::CPUPlace();
2222
auto gpu_place = paddle::platform::CUDAPlace(0);
2323
paddle::platform::CUDADeviceContext context(gpu_place);
24-
24+
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
25+
.GetAllocator(gpu_place, context.stream())
26+
.get());
27+
context.PartialInitWithAllocator();
2528
auto kernel_fp16 = paddle::framework::OpKernelType(
2629
paddle::framework::proto::VarType::FP16, gpu_place,
2730
paddle::framework::DataLayout::kAnyLayout,

paddle/fluid/framework/parallel_executor.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
13611361
auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
13621362
pool.Get(member_->places_[dev_id]));
13631363
auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
1364-
dev_ctx->set_bkcl_context(bkcl_ctx.comm());
1364+
dev_ctx->SetBkclContext(bkcl_ctx.comm());
13651365
}
13661366
#else
13671367
PADDLE_THROW(

paddle/fluid/framework/pten_utils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
7777
using TYPE = pten::CPUContext;
7878
};
7979

80+
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
81+
template <>
82+
struct ConvertToPtenContext<platform::CUDADeviceContext> {
83+
using TYPE = pten::GPUContext;
84+
};
85+
#endif
86+
8087
#ifdef PADDLE_WITH_XPU
8188
template <>
8289
struct ConvertToPtenContext<platform::XPUDeviceContext> {

paddle/fluid/framework/tensor_util.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
10851085
is.seekg(seekg, is.cur);
10861086

10871087
void* buf;
1088-
auto ctx = platform::CPUDeviceContext();
1088+
platform::CPUDeviceContext ctx;
10891089
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
10901090
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
10911091
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
11551155
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
11561156
tensor->Resize(framework::make_ddim(dims));
11571157
void* buf;
1158-
auto ctx = platform::CPUDeviceContext();
1158+
platform::CPUDeviceContext ctx;
11591159
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
11601160
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
11611161
platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
14321432
VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
14331433
return os;
14341434
}
1435-
}
1435+
} // namespace pten

paddle/fluid/framework/tensor_util_test.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
7373
// CPU Tensor to GPU Tensor
7474
auto gpu_place = new platform::CUDAPlace(0);
7575
platform::CUDADeviceContext gpu_ctx(*gpu_place);
76+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
77+
.GetAllocator(*gpu_place, gpu_ctx.stream())
78+
.get());
79+
gpu_ctx.PartialInitWithAllocator();
7680
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
7781

7882
// GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
166170
gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
167171
auto gpu_place = new paddle::platform::CUDAPlace();
168172
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
173+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
174+
.GetAllocator(*gpu_place, gpu_ctx.stream())
175+
.get());
176+
gpu_ctx.PartialInitWithAllocator();
169177
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
170178
// Copy from GPU to CPU tensor for comparison
171179
paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
230238
paddle::framework::Tensor gpu_tensor;
231239
paddle::platform::CUDAPlace place;
232240
paddle::platform::CUDADeviceContext gpu_ctx(place);
241+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
242+
.GetAllocator(place, gpu_ctx.stream())
243+
.get());
244+
gpu_ctx.PartialInitWithAllocator();
233245
paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
234246

235247
std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
267279
paddle::framework::Tensor gpu_tensor;
268280
paddle::platform::CUDAPlace place;
269281
paddle::platform::CUDADeviceContext gpu_ctx(place);
282+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
283+
.GetAllocator(place, gpu_ctx.stream())
284+
.get());
285+
gpu_ctx.PartialInitWithAllocator();
270286
paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
271287

272288
std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
493509

494510
auto gpu_place = new platform::CUDAPlace();
495511
platform::CUDADeviceContext gpu_ctx(*gpu_place);
512+
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
513+
.GetAllocator(*gpu_place, gpu_ctx.stream())
514+
.get());
515+
gpu_ctx.PartialInitWithAllocator();
496516

497517
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
498518

paddle/fluid/imperative/gloo_context.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
4646
gloo_wrapper->Init();
4747
device_ = std::unique_ptr<platform::CPUDeviceContext>(
4848
new platform::CPUDeviceContext(platform::CPUPlace()));
49+
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
50+
.GetAllocator(platform::CPUPlace())
51+
.get());
52+
device_->SetHostAllocator(
53+
paddle::memory::allocation::AllocatorFacade::Instance()
54+
.GetAllocator(paddle::platform::CPUPlace())
55+
.get());
56+
device_->SetZeroAllocator(
57+
paddle::memory::allocation::AllocatorFacade::Instance()
58+
.GetZeroAllocator(platform::CPUPlace())
59+
.get());
4960
}
5061

5162
void GLOOParallelContext::InitWithRingID(int ring_id) {

paddle/fluid/inference/lite/test_engine_lite.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
7777
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
7878
platform::CUDAPlace place;
7979
platform::CUDADeviceContext ctx(place);
80+
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
81+
.GetAllocator(place, ctx.stream())
82+
.get());
83+
ctx.PartialInitWithAllocator();
8084
#else
8185
platform::CPUPlace place;
8286
platform::CPUDeviceContext ctx(place);

0 commit comments

Comments (0)