6 changes: 3 additions & 3 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -39,7 +39,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 #if CUDA_VERSION >= 10020
@@ -157,7 +157,7 @@ class CUDAGraphAllocator
 
 static bool IsCUDAGraphCapturing() {
 #ifdef PADDLE_WITH_CUDA
-  return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
+  return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
 #else
   return false;
 #endif
@@ -1007,7 +1007,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(IsCUDAGraphCapturing())) {
-    auto id = platform::CUDAGraph::CapturingPoolID();
+    auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
         iter,
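Note on the `GetPrivate()` hunk: during CUDA Graph capture, allocation requests are routed to the allocator set registered under `CUDAGraph::CapturingPoolID()`, and a missing `cuda_graph_map_` entry is a hard error rather than a fallback. A minimal standalone sketch of that routing; `GraphPools` and `Pool` are illustrative names, not Paddle APIs:

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <stdexcept>

struct Pool { /* per-graph allocator state */ };

class GraphPools {
 public:
  // Mirrors the intent of AllocatorFacade::GetPrivate() above: look up the
  // allocator set keyed by the capturing pool id; an unknown id is an error,
  // never a silent fallback to the default allocators.
  Pool* GetForCapture(int64_t pool_id) {
    auto iter = pools_.find(pool_id);
    if (iter == pools_.end()) {
      throw std::runtime_error("memory pool not found for CUDA Graph capture");
    }
    return iter->second.get();
  }

 private:
  std::map<int64_t, std::unique_ptr<Pool>> pools_;
};
```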
6 changes: 3 additions & 3 deletions paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -19,7 +19,7 @@
 #include "paddle/phi/backends/gpu/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 namespace paddle {
@@ -49,7 +49,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
 
   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
     graph_capturing_stream_set_.insert(stream);
     return;
   }
@@ -61,7 +61,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
 
 bool StreamSafeCUDAAllocation::CanBeFreed() {
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
     return graph_capturing_stream_set_.empty() &&
            outstanding_event_map_.empty();
   }
2 changes: 1 addition & 1 deletion paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -319,7 +319,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
         data, result, data_num_);
     RecordStream(data_allocation, other_stream);
 
-    std::unique_ptr<platform::CUDAGraph> cuda_graph =
+    std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph =
         platform::EndCUDAGraphCapture();
 
     int replay_times = 10;
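The test above is the end-to-end exercise of the relocated class: capture, then replay. A stripped-down sketch of the same flow, assuming `CUDAGraph::Replay()` is the replay entry point (the test replays 10 times) and omitting tensor setup, stream creation, and error handling:

```cpp
#include <memory>

#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"

void CaptureAndReplay(phi::GPUPlace place) {
  // Kernels launched on the capturing stream between Begin/End are recorded
  // into the graph instead of executing immediately.
  paddle::platform::BeginCUDAGraphCapture(place,
                                          cudaStreamCaptureModeThreadLocal);
  // ... launch kernels and RecordStream(...) as in the test above ...
  std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph =
      paddle::platform::EndCUDAGraphCapture();

  for (int i = 0; i < 10; ++i) {
    cuda_graph->Replay();  // assumed replay method, mirroring the test
  }
}
```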
2 changes: 1 addition & 1 deletion paddle/fluid/operators/cuda_graph_with_in_out.h
@@ -89,7 +89,7 @@ class CUDAGraphWithInOuts {
   int64_t PoolID() const { return graph_->PoolID(); }
 
  private:
-  std::unique_ptr<platform::CUDAGraph> graph_;
+  std::unique_ptr<phi::backends::gpu::CUDAGraph> graph_;
   std::vector<phi::DenseTensor> ins_;
   std::vector<phi::DenseTensor> outs_;
   std::vector<int64_t> in_indices_;
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/fmha_ref.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/operators/dropout_impl.cu.h"
 #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/functors.h"
@@ -206,7 +206,7 @@ class FMHARef {
     stride_b = gemm_k * gemm_n;
 
     if (dropout_param_.dropout_prob_) {
-      DropoutFwGPUKernelDriver<T>(
+      phi::funcs::DropoutFwGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          dropout_param_.is_test_,
          dropout_param_.dropout_prob_,
@@ -381,7 +381,7 @@ class FMHARef {
     stride_b = gemm_k * gemm_n;
 
     if (dropout_param_.dropout_prob_) {
-      DropoutFwGPUKernelDriver<T>(
+      phi::funcs::DropoutFwGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          dropout_param_.is_test_,
          dropout_param_.dropout_prob_,
@@ -552,7 +552,7 @@ class FMHARef {
     }
     // dropout bw
     if (dropout_param_.dropout_prob_) {
-      DropoutGradGPUKernelDriver<T>(
+      phi::funcs::DropoutGradGPUKernelDriver<T>(
          static_cast<const phi::GPUContext&>(dev_ctx_),
          false,
          dropout_param_.dropout_prob_,
4 changes: 2 additions & 2 deletions paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
 #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
 #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
+#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 #include "paddle/phi/kernels/layer_norm_kernel.h"
 
@@ -106,7 +106,7 @@ struct DropoutParam {
 
   int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) {
     uint64_t tmp_increment;
-    GetSeedDataAndIncrement(
+    phi::funcs::GetSeedDataAndIncrement(
         ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment);
     increment = static_cast<int>(tmp_increment);
     return increment;
8 changes: 4 additions & 4 deletions paddle/fluid/platform/cuda_graph_with_memory_pool.cc
@@ -15,18 +15,18 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/all_context.h"
 
 DECLARE_bool(use_stream_safe_cuda_allocator);
 
 namespace paddle {
 namespace platform {
 
 #ifdef PADDLE_WITH_CUDA
-void BeginCUDAGraphCapture(platform::CUDAPlace place,
+void BeginCUDAGraphCapture(phi::GPUPlace place,
                            cudaStreamCaptureMode mode,
                            int64_t pool_id) {
-  auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
 
@@ -64,7 +64,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
 
 std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
   auto place = CUDAGraph::CapturingPlace();
-  auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
   auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
   dev_ctx->SetCUDAGraphAllocator(nullptr);
111 changes: 13 additions & 98 deletions paddle/fluid/platform/cuda_graph_with_memory_pool.h
@@ -14,123 +14,38 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
-#endif
+#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/macros.h"
 
 namespace paddle {
 namespace platform {
 
-#ifdef PADDLE_WITH_CUDA
-#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond,        \
-                                           __kernel_func, \
-                                           __grid,        \
-                                           __block,       \
-                                           __sm_size,     \
-                                           __stream,      \
-                                           __seed_inc,    \
-                                           __seed_expr,   \
-                                           __offset_expr, \
-                                           ...)           \
-  do {                                                                        \
-    if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \
-      using __Helper =                                                        \
-          ::paddle::platform::IsSameKernelHelper<decltype(&__kernel_func),    \
-                                                 &__kernel_func>;             \
-      auto *dev_ctx =                                                         \
-          ::paddle::platform::DeviceContextPool::Instance().GetByPlace(       \
-              ::paddle::platform::CUDAGraph::CapturingPlace());               \
-      auto __set_seed_func =                                                  \
-          [=](::paddle::platform::CUDAKernelParams *__params,                 \
-              bool __check_only) -> bool {                                    \
-        if (__check_only) {                                                   \
-          return __params->func() == &__kernel_func &&                        \
-                 __Helper::Compare(*__params, __VA_ARGS__);                   \
-        }                                                                     \
-        auto &KERNEL_PARAMS = *__params;                                      \
-        uint64_t __seed, __offset;                                            \
-        ::paddle::operators::GetSeedDataAndIncrement(                         \
-            *dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset);     \
-        __seed_expr = static_cast<decltype(__seed_expr)>(__seed);             \
-        __offset_expr = static_cast<decltype(__offset_expr)>(__offset);       \
-        return true;                                                          \
-      };                                                                      \
-      ::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \
-    }                                                                         \
-    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__);     \
-  } while (0)
-#else
-#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond,        \
-                                           __kernel_func, \
-                                           __grid,        \
-                                           __block,       \
-                                           __sm_size,     \
-                                           __stream,      \
-                                           __seed_inc,    \
-                                           __seed_expr,   \
-                                           __offset_expr, \
-                                           ...)           \
-  do {                                                                    \
-    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
-  } while (0)
-#endif
-
 // NOTE: These APIs are not thread-safe.
 #ifdef PADDLE_WITH_CUDA
-void BeginCUDAGraphCapture(platform::CUDAPlace place,
+using CUDAGraph = phi::backends::gpu::CUDAGraph;
+
+void BeginCUDAGraphCapture(phi::GPUPlace place,
                            cudaStreamCaptureMode mode,
                            int64_t pool_id = CUDAGraph::kInvalidPoolID);
 std::unique_ptr<CUDAGraph> EndCUDAGraphCapture();
 #endif
 
-inline bool IsCUDAGraphCapturing() {
-#ifdef PADDLE_WITH_CUDA
-  return CUDAGraph::IsCapturing();
-#else
-  return false;
-#endif
-}
-
-inline platform::CUDAPlace CUDAGraphCapturingPlace() {
+inline phi::GPUPlace CUDAGraphCapturingPlace() {
 #ifdef PADDLE_WITH_CUDA
   return CUDAGraph::CapturingPlace();
 #else
-  PADDLE_THROW(platform::errors::Unimplemented(
+  PADDLE_THROW(phi::errors::Unimplemented(
       "CUDA Graph is only supported on NVIDIA GPU device."));
 #endif
 }
 
-// Add reset callback if CUDA Graph is capturing.
-// Otherwise, invoke callback directly.
-template <typename Callback>
-inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(IsCUDAGraphCapturing())) {
-    return CUDAGraph::AddResetCallbackDuringCapturing(
-        std::forward<Callback>(callback));
-  }
-#endif
-  callback();
-}
+using phi::backends::gpu::IsCUDAGraphCapturing;
 
-template <typename T>
-inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
-  static_assert(std::is_trivial<T>::value, "T must be trivial type");
-  static_assert(!std::is_same<T, void>::value, "T cannot be void");
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(IsCUDAGraphCapturing())) {
-    size_t nbytes = size * sizeof(T);
-    void *new_host_mem = new uint8_t[nbytes];
-    std::memcpy(new_host_mem, host_mem, nbytes);
-    AddResetCallbackIfCapturingCUDAGraph(
-        [new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
-    return reinterpret_cast<T *>(new_host_mem);
-  }
-#endif
-  return host_mem;
-}
+using phi::backends::gpu::AddResetCallbackIfCapturingCUDAGraph;
+
+using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph;
 
 class SkipCUDAGraphCaptureGuard {
   DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard);
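The three deleted helpers are not gone; they now live in phi and are re-exported through the using-declarations above, so call sites compile unchanged. A usage sketch of `RestoreHostMemIfCapturingCUDAGraph`, based on the deleted fluid implementation (during capture it returns a heap copy released by a reset callback when the graph is destroyed; outside capture it returns the input pointer as-is); `LaunchWithStableHostArgs` is an illustrative caller, not a Paddle function:

```cpp
#include <cstddef>

#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"

void LaunchWithStableHostArgs(int *host_args, size_t n) {
  // If a CUDA Graph is capturing, the launched kernel may be replayed after
  // `host_args` is freed by the caller, so a stable copy is substituted.
  int *stable_args =
      paddle::platform::RestoreHostMemIfCapturingCUDAGraph(host_args, n);
  // ... pass `stable_args` to a kernel launch that may be captured ...
}
```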
78 changes: 0 additions & 78 deletions paddle/fluid/platform/device/gpu/cuda/cuda_graph.h

This file was deleted.

4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -36,8 +36,8 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/platform/dynload/miopen.h"
 #else
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 #ifdef PADDLE_WITH_CUDA
@@ -230,7 +230,7 @@ class RecordedGpuMallocHelper {
     result = hipMalloc(ptr, size);
   }
 #else
-  CUDAGraphCaptureModeGuard capture_mode_guard;
+  phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
   if (UNLIKELY(malloc_managed_memory)) {
     result = cudaMallocManaged(ptr, size);
   } else {