
Commit 579784e

[PHI decouple] move dropout_impl and cuda_graph_with_memory_pool from fluid to phi (#49139)
* move dropout_impl from fluid to phi
* move cuda_graph_with_memory_pool from fluid to phi
* update namespace
* remove cuda_graph in fluid
* fix mac-build
* fix bugs
* correct CodeStyle
* fix mac-build
* fix mutable_data
* fix stl include
* fix copy param
Parent: 44973c6

19 files changed: +285 −280 lines
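
The same migration pattern repeats across every file below: the implementation moves into the phi namespace, and paddle::platform keeps old call sites compiling by re-exporting the phi symbols with using-declarations. A minimal standalone sketch of that pattern (not Paddle source; IsCUDAGraphCapturing here is a stand-in for the real symbols):

```cpp
#include <iostream>

namespace phi {
namespace backends {
namespace gpu {
// Hypothetical stand-in for the utilities that now live in phi
// (e.g. CUDAGraph, IsCUDAGraphCapturing).
inline bool IsCUDAGraphCapturing() { return false; }
}  // namespace gpu
}  // namespace backends
}  // namespace phi

namespace paddle {
namespace platform {
// The fluid-side header now only forwards to phi instead of owning code.
using phi::backends::gpu::IsCUDAGraphCapturing;
}  // namespace platform
}  // namespace paddle

int main() {
  // Old and new spellings resolve to the single phi implementation.
  std::cout << paddle::platform::IsCUDAGraphCapturing() << "\n";    // 0
  std::cout << phi::backends::gpu::IsCUDAGraphCapturing() << "\n";  // 0
  return 0;
}
```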

paddle/fluid/memory/allocation/allocator_facade.cc

Lines changed: 3 additions & 3 deletions
@@ -39,7 +39,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 #if CUDA_VERSION >= 10020
@@ -157,7 +157,7 @@ class CUDAGraphAllocator
 
   static bool IsCUDAGraphCapturing() {
 #ifdef PADDLE_WITH_CUDA
-    return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
+    return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
 #else
     return false;
 #endif
@@ -1007,7 +1007,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(IsCUDAGraphCapturing())) {
-    auto id = platform::CUDAGraph::CapturingPoolID();
+    auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
         iter,
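
For context on the GetPrivate() hunk: while a CUDA graph is capturing, allocations must be served from the memory pool owned by that graph, keyed by its pool id. A minimal sketch of that dispatch with stand-in types (not the real AllocatorFacade):

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <stdexcept>

struct AllocatorFacadePrivate {};  // stand-in for the real private impl

class AllocatorFacade {
 public:
  AllocatorFacade() : default_(new AllocatorFacadePrivate) {}

  AllocatorFacadePrivate* GetPrivate(bool capturing,
                                     int64_t capturing_pool_id) const {
    if (capturing) {
      // Route allocations to the allocator owned by the capturing graph's
      // pool; a missing entry is an error, as in the PADDLE_ENFORCE_NE above.
      auto iter = cuda_graph_map_.find(capturing_pool_id);
      if (iter == cuda_graph_map_.end()) {
        throw std::runtime_error("no memory pool for the capturing CUDA graph");
      }
      return iter->second.get();
    }
    return default_.get();  // normal (non-capturing) path
  }

  void PrepareMemoryPoolForCUDAGraph(int64_t pool_id) {
    cuda_graph_map_.emplace(pool_id,
                            std::make_unique<AllocatorFacadePrivate>());
  }

 private:
  std::unique_ptr<AllocatorFacadePrivate> default_;
  std::map<int64_t, std::unique_ptr<AllocatorFacadePrivate>> cuda_graph_map_;
};

int main() {
  AllocatorFacade facade;
  facade.PrepareMemoryPoolForCUDAGraph(/*pool_id=*/7);
  auto* a = facade.GetPrivate(/*capturing=*/true, /*capturing_pool_id=*/7);
  (void)a;
  return 0;
}
```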

paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@
 #include "paddle/phi/backends/gpu/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 namespace paddle {
@@ -49,7 +49,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
 
   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
     graph_capturing_stream_set_.insert(stream);
     return;
   }
@@ -61,7 +61,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
 
 bool StreamSafeCUDAAllocation::CanBeFreed() {
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
    return graph_capturing_stream_set_.empty() &&
            outstanding_event_map_.empty();
   }
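
The two hunks above implement one policy: while a graph is capturing, event-based lifetime tracking is deferred, so the allocation just remembers which streams touched it and cannot be freed until capture ends and that set drains. A standalone sketch of the policy with stand-in types (not Paddle source):

```cpp
#include <cassert>
#include <mutex>
#include <set>

using gpuStream_t = void*;  // stand-in for the real stream handle

class StreamSafeAllocation {
 public:
  void RecordStream(gpuStream_t stream, bool thread_is_capturing) {
    std::lock_guard<std::mutex> guard(mu_);
    if (thread_is_capturing) {
      graph_capturing_stream_set_.insert(stream);
      return;  // no event recorded while capturing
    }
    ++outstanding_events_;  // non-capturing path: record a CUDA event
  }

  bool CanBeFreed(bool thread_is_capturing) {
    std::lock_guard<std::mutex> guard(mu_);
    if (thread_is_capturing) {
      return graph_capturing_stream_set_.empty() && outstanding_events_ == 0;
    }
    return outstanding_events_ == 0;  // real code polls the recorded events
  }

 private:
  std::mutex mu_;
  std::set<gpuStream_t> graph_capturing_stream_set_;
  int outstanding_events_ = 0;  // stand-in for outstanding_event_map_
};

int main() {
  StreamSafeAllocation alloc;
  int dummy;
  alloc.RecordStream(&dummy, /*thread_is_capturing=*/true);
  assert(!alloc.CanBeFreed(/*thread_is_capturing=*/true));
  return 0;
}
```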

paddle/fluid/memory/stream_safe_cuda_alloc_test.cu

Lines changed: 1 addition & 1 deletion
@@ -319,7 +319,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
       data, result, data_num_);
   RecordStream(data_allocation, other_stream);
 
-  std::unique_ptr<platform::CUDAGraph> cuda_graph =
+  std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph =
       platform::EndCUDAGraphCapture();
 
   int replay_times = 10;

paddle/fluid/operators/cuda_graph_with_in_out.h

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ class CUDAGraphWithInOuts {
   int64_t PoolID() const { return graph_->PoolID(); }
 
  private:
-  std::unique_ptr<platform::CUDAGraph> graph_;
+  std::unique_ptr<phi::backends::gpu::CUDAGraph> graph_;
   std::vector<phi::DenseTensor> ins_;
   std::vector<phi::DenseTensor> outs_;
   std::vector<int64_t> in_indices_;

paddle/fluid/operators/fused/fmha_ref.h

Lines changed: 4 additions & 4 deletions
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/operators/dropout_impl.cu.h"
 #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/functors.h"
@@ -206,7 +206,7 @@ class FMHARef {
     stride_b = gemm_k * gemm_n;
 
     if (dropout_param_.dropout_prob_) {
-      DropoutFwGPUKernelDriver<T>(
+      phi::funcs::DropoutFwGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          dropout_param_.is_test_,
          dropout_param_.dropout_prob_,
@@ -381,7 +381,7 @@ class FMHARef {
     stride_b = gemm_k * gemm_n;
 
     if (dropout_param_.dropout_prob_) {
-      DropoutFwGPUKernelDriver<T>(
+      phi::funcs::DropoutFwGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          dropout_param_.is_test_,
          dropout_param_.dropout_prob_,
@@ -552,7 +552,7 @@ class FMHARef {
     }
     // dropout bw
     if (dropout_param_.dropout_prob_) {
-      DropoutGradGPUKernelDriver<T>(
+      phi::funcs::DropoutGradGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          false,
          dropout_param_.dropout_prob_,
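
The renamed drivers implement inverted dropout: the forward pass scales kept activations by 1/(1-p) so the expected value is unchanged, and the backward pass reuses the saved mask. A CPU sketch of that math, assuming the standard formulation rather than mirroring the CUDA kernels:

```cpp
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

// Forward: keep each element with probability 1 - p, scale by 1/(1 - p).
void DropoutForward(const std::vector<float>& x, float p, uint64_t seed,
                    std::vector<uint8_t>* mask, std::vector<float>* y) {
  std::mt19937_64 rng(seed);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  const float scale = 1.0f / (1.0f - p);
  for (size_t i = 0; i < x.size(); ++i) {
    (*mask)[i] = uniform(rng) >= p;  // keep with probability 1 - p
    (*y)[i] = x[i] * (*mask)[i] * scale;
  }
}

// Backward: d_x = d_y * mask / (1 - p), reusing the saved mask.
void DropoutBackward(const std::vector<float>& dy,
                     const std::vector<uint8_t>& mask, float p,
                     std::vector<float>* dx) {
  const float scale = 1.0f / (1.0f - p);
  for (size_t i = 0; i < dy.size(); ++i) (*dx)[i] = dy[i] * mask[i] * scale;
}

int main() {
  std::vector<float> x(8, 1.0f), y(8), dx(8);
  std::vector<uint8_t> mask(8);
  DropoutForward(x, /*p=*/0.5f, /*seed=*/42, &mask, &y);
  DropoutBackward(x /* pretend dy == x */, mask, 0.5f, &dx);
  for (float v : y) std::cout << v << " ";  // each element is 0 or 2
  std::cout << "\n";
  return 0;
}
```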

paddle/fluid/operators/fused/fused_dropout_helper.h

Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
 #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
 #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
+#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 #include "paddle/phi/kernels/layer_norm_kernel.h"
 
@@ -106,7 +106,7 @@ struct DropoutParam {
 
   int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) {
     uint64_t tmp_increment;
-    GetSeedDataAndIncrement(
+    phi::funcs::GetSeedDataAndIncrement(
        ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment);
     increment = static_cast<int>(tmp_increment);
     return increment;
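
UpdateSeedAndIncrement now delegates to phi::funcs::GetSeedDataAndIncrement. The bookkeeping it performs follows the usual counter-based (Philox-style) RNG contract: each launch consumes some number of random states, so the host hands out (seed, offset) and advances the counter. A CPU sketch under that assumption (names hypothetical, not the phi implementation):

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

struct GeneratorState {
  uint64_t seed = 42;      // fixed per generator
  uint64_t increment = 0;  // total random states consumed so far
};

// Returns the (seed, offset) pair a kernel should use, then advances the
// counter so the next launch draws fresh random numbers.
std::pair<uint64_t, uint64_t> GetSeedAndIncrement(GeneratorState* gen,
                                                  uint64_t states_needed) {
  uint64_t offset = gen->increment;
  gen->increment += states_needed;
  return {gen->seed, offset};
}

int main() {
  GeneratorState gen;
  auto [seed1, off1] = GetSeedAndIncrement(&gen, 4);  // first dropout launch
  auto [seed2, off2] = GetSeedAndIncrement(&gen, 4);  // second launch
  std::cout << seed1 << " " << off1 << "\n";  // 42 0
  std::cout << seed2 << " " << off2 << "\n";  // 42 4
  return 0;
}
```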

paddle/fluid/platform/cuda_graph_with_memory_pool.cc

Lines changed: 4 additions & 4 deletions
@@ -15,18 +15,18 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 
 DECLARE_bool(use_stream_safe_cuda_allocator);
 
 namespace paddle {
 namespace platform {
 
 #ifdef PADDLE_WITH_CUDA
-void BeginCUDAGraphCapture(platform::CUDAPlace place,
+void BeginCUDAGraphCapture(phi::GPUPlace place,
                            cudaStreamCaptureMode mode,
                            int64_t pool_id) {
-  auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
 
@@ -64,7 +64,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
 
 std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
   auto place = CUDAGraph::CapturingPlace();
-  auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
   dev_ctx->SetCUDAGraphAllocator(nullptr);
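
Begin/EndCUDAGraphCapture wrap the CUDA runtime's stream-capture API on top of Paddle's memory-pool and workspace bookkeeping. A bare-bones capture/replay sketch using only the runtime API (not Paddle's wrappers), with the pre-CUDA-12 five-argument cudaGraphInstantiate that matches this codebase's CUDA_VERSION >= 10020 guard:

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__global__ void AddOne(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] += 1.0f;
}

int main() {
  const int n = 256;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemset(d, 0, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Begin capture: work launched on `stream` is recorded, not executed.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  AddOne<<<1, n, 0, stream>>>(d, n);

  // End capture and instantiate an executable graph.
  cudaGraph_t graph;
  cudaStreamEndCapture(stream, &graph);
  cudaGraphExec_t exec;
  cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);

  // Replay the captured work several times, as the allocator test does.
  for (int i = 0; i < 10; ++i) cudaGraphLaunch(exec, stream);
  cudaStreamSynchronize(stream);

  float host[n];
  cudaMemcpy(host, d, sizeof(host), cudaMemcpyDeviceToHost);
  printf("x[0] = %f\n", host[0]);  // expect 10.0

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d);
  return 0;
}
```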

paddle/fluid/platform/cuda_graph_with_memory_pool.h

Lines changed: 13 additions & 98 deletions
@@ -14,123 +14,38 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
-#endif
+#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/macros.h"
 
 namespace paddle {
 namespace platform {
 
-#ifdef PADDLE_WITH_CUDA
-#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond,                            \
-                                           __kernel_func,                     \
-                                           __grid,                            \
-                                           __block,                           \
-                                           __sm_size,                         \
-                                           __stream,                          \
-                                           __seed_inc,                        \
-                                           __seed_expr,                       \
-                                           __offset_expr,                     \
-                                           ...)                               \
-  do {                                                                        \
-    if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \
-      using __Helper =                                                        \
-          ::paddle::platform::IsSameKernelHelper<decltype(&__kernel_func),    \
-                                                 &__kernel_func>;             \
-      auto *dev_ctx =                                                         \
-          ::paddle::platform::DeviceContextPool::Instance().GetByPlace(       \
-              ::paddle::platform::CUDAGraph::CapturingPlace());               \
-      auto __set_seed_func =                                                  \
-          [=](::paddle::platform::CUDAKernelParams *__params,                 \
-              bool __check_only) -> bool {                                    \
-        if (__check_only) {                                                   \
-          return __params->func() == &__kernel_func &&                        \
-                 __Helper::Compare(*__params, __VA_ARGS__);                   \
-        }                                                                     \
-        auto &KERNEL_PARAMS = *__params;                                      \
-        uint64_t __seed, __offset;                                            \
-        ::paddle::operators::GetSeedDataAndIncrement(                         \
-            *dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset);     \
-        __seed_expr = static_cast<decltype(__seed_expr)>(__seed);             \
-        __offset_expr = static_cast<decltype(__offset_expr)>(__offset);       \
-        return true;                                                          \
-      };                                                                      \
-      ::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \
-    }                                                                         \
-    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__);     \
-  } while (0)
-#else
-#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond,                            \
-                                           __kernel_func,                     \
-                                           __grid,                            \
-                                           __block,                           \
-                                           __sm_size,                         \
-                                           __stream,                          \
-                                           __seed_inc,                        \
-                                           __seed_expr,                       \
-                                           __offset_expr,                     \
-                                           ...)                               \
-  do {                                                                        \
-    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__);     \
-  } while (0)
-#endif
-
 // NOTE: These APIs are not thread-safe.
 #ifdef PADDLE_WITH_CUDA
-void BeginCUDAGraphCapture(platform::CUDAPlace place,
+using CUDAGraph = phi::backends::gpu::CUDAGraph;
+
+void BeginCUDAGraphCapture(phi::GPUPlace place,
                            cudaStreamCaptureMode mode,
                            int64_t pool_id = CUDAGraph::kInvalidPoolID);
 std::unique_ptr<CUDAGraph> EndCUDAGraphCapture();
 #endif
 
-inline bool IsCUDAGraphCapturing() {
-#ifdef PADDLE_WITH_CUDA
-  return CUDAGraph::IsCapturing();
-#else
-  return false;
-#endif
-}
-
-inline platform::CUDAPlace CUDAGraphCapturingPlace() {
+inline phi::GPUPlace CUDAGraphCapturingPlace() {
 #ifdef PADDLE_WITH_CUDA
   return CUDAGraph::CapturingPlace();
 #else
-  PADDLE_THROW(platform::errors::Unimplemented(
+  PADDLE_THROW(phi::errors::Unimplemented(
       "CUDA Graph is only supported on NVIDIA GPU device."));
 #endif
 }
 
-// Add reset callback if CUDA Graph is capturing.
-// Otherwise, invoke callback directly.
-template <typename Callback>
-inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(IsCUDAGraphCapturing())) {
-    return CUDAGraph::AddResetCallbackDuringCapturing(
-        std::forward<Callback>(callback));
-  }
-#endif
-  callback();
-}
+using phi::backends::gpu::IsCUDAGraphCapturing;
 
-template <typename T>
-inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
-  static_assert(std::is_trivial<T>::value, "T must be trivial type");
-  static_assert(!std::is_same<T, void>::value, "T cannot be void");
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(IsCUDAGraphCapturing())) {
-    size_t nbytes = size * sizeof(T);
-    void *new_host_mem = new uint8_t[nbytes];
-    std::memcpy(new_host_mem, host_mem, nbytes);
-    AddResetCallbackIfCapturingCUDAGraph(
-        [new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
-    return reinterpret_cast<T *>(new_host_mem);
-  }
-#endif
-  return host_mem;
-}
+using phi::backends::gpu::AddResetCallbackIfCapturingCUDAGraph;
+
+using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph;
 
 class SkipCUDAGraphCaptureGuard {
   DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard);
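
The RestoreHostMemIfCapturingCUDAGraph helper deleted above now lives in phi, same idea: a host buffer referenced during capture must outlive the capture, so it is cloned and freed later by a reset callback. A standalone sketch of that idea (simplified; unlike the sketch's flag, the real version only clones while a graph is actually capturing):

```cpp
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <type_traits>
#include <vector>

// Callbacks to run when the captured graph is reset.
static std::vector<std::function<void()>> reset_callbacks;

void AddResetCallback(std::function<void()> cb) {
  reset_callbacks.push_back(std::move(cb));
}

// Clone a host buffer so it outlives the capture: replaying the graph may
// dereference the pointer long after the caller's original buffer is gone.
template <typename T>
T* RestoreHostMem(T* host_mem, size_t size, bool capturing) {
  static_assert(std::is_trivial<T>::value, "T must be trivial type");
  if (!capturing) return host_mem;
  size_t nbytes = size * sizeof(T);
  auto* copy = new uint8_t[nbytes];
  std::memcpy(copy, host_mem, nbytes);
  AddResetCallback([copy] { delete[] copy; });  // freed on graph reset
  return reinterpret_cast<T*>(copy);
}

int main() {
  int data[4] = {1, 2, 3, 4};
  int* kept = RestoreHostMem(data, 4, /*capturing=*/true);
  std::cout << kept[3] << "\n";           // 4: the clone is live
  for (auto& cb : reset_callbacks) cb();  // graph reset frees the clone
  return 0;
}
```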

paddle/fluid/platform/device/gpu/cuda/cuda_graph.h

Lines changed: 0 additions & 78 deletions
This file was deleted.

paddle/fluid/platform/device/gpu/gpu_info.cc

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/platform/dynload/miopen.h"
 #else
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 #ifdef PADDLE_WITH_CUDA
@@ -230,7 +230,7 @@ class RecordedGpuMallocHelper {
       result = hipMalloc(ptr, size);
     }
 #else
-    CUDAGraphCaptureModeGuard capture_mode_guard;
+    phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
    if (UNLIKELY(malloc_managed_memory)) {
      result = cudaMallocManaged(ptr, size);
    } else {
