12 changes: 6 additions & 6 deletions paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -153,7 +153,7 @@ __global__ void SparseAdagradFunctorKernel(const T* grad,

template <typename T>
struct SparseAdagradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::SelectedRows& grad,
const DenseTensor& learning_rate,
T epsilon,
@@ -162,15 +162,15 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
// 1. g_m.rows = set(g.rows)
auto grad_width = grad.value().dims()[1];
phi::funcs::scatter::MergeAdd<phi::GPUContext, T> merge_func;
- auto grad_merge = merge_func(context, grad);
+ auto grad_merge = merge_func(dev_ctx, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
phi::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
auto grad_square =
- SquareSelectedRows<phi::GPUContext, T>(context, grad_merge);
+ SquareSelectedRows<phi::GPUContext, T>(dev_ctx, grad_merge);

phi::funcs::SelectedRowsAddToTensor<phi::GPUContext, T> functor;
- functor(context, grad_square, moment);
+ functor(dev_ctx, grad_square, moment);

// 3. update parameter
auto* lr = learning_rate.data<T>();
@@ -185,9 +185,9 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
<<<grid2,
threads,
0,
- reinterpret_cast<const phi::GPUContext&>(context).stream()>>>(
+ reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream()>>>(
grad_merge_data,
- mixv_merge_rows.CUDAMutableData(context.GetPlace()),
+ mixv_merge_rows.CUDAMutableData(dev_ctx.GetPlace()),
lr,
param_data,
moment_data,
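For context on what this functor computes: the numbered comments (1. merge rows, 2. m += g * g, 3. update parameter) are the standard Adagrad rule, applied only to the rows touched by the sparse gradient. A minimal dense sketch of that update follows; the names are illustrative and this is not the Paddle kernel.

```cpp
// Hedged sketch, not Paddle code: the element-wise Adagrad rule that
// SparseAdagradFunctor applies to the merged gradient rows.
#include <cmath>
#include <cstddef>

void AdagradUpdate(float* param, float* moment, const float* grad,
                   float lr, float epsilon, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    moment[i] += grad[i] * grad[i];                               // m += g * g
    param[i] -= lr * grad[i] / (std::sqrt(moment[i]) + epsilon);  // update
  }
}
```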
44 changes: 22 additions & 22 deletions paddle/phi/kernels/gpu/depthwise_conv.h
@@ -133,7 +133,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const std::vector<int>& strides,
@@ -148,7 +148,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const phi::DenseTensor& output_grad,
@@ -164,7 +164,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& output_grad,
const std::vector<int>& strides,
@@ -1450,7 +1450,7 @@ __global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data,
template <class T, bool fuse_relu_before_conv>
class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const std::vector<int>& strides,
@@ -1485,7 +1485,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {

const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
- T* output_data = context.template Alloc<T>(output);
+ T* output_data = dev_ctx.template Alloc<T>(output);

phi::DenseTensor filter_hwc;
if (data_layout == DataLayout::kNHWC) {
@@ -1494,10 +1494,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims);
- context.template Alloc<T>(&filter_hwc);
+ dev_ctx.template Alloc<T>(&filter_hwc);
std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
- trans(context, filter, &filter_hwc, perm_axis);
+ trans(dev_ctx, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>();
}

@@ -1546,7 +1546,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
filter_data, \
batch_size, \
output_channels, \
@@ -1572,7 +1572,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
filter_data, \
batch_size, \
output_channels, \
@@ -1616,7 +1616,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const phi::DenseTensor& output_grad,
@@ -1653,7 +1653,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>();
- T* input_grad_data = context.template Alloc<T>(input_grad);
+ T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);

phi::DenseTensor filter_hwc;
if (data_layout == DataLayout::kNHWC) {
@@ -1662,10 +1662,10 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims);
- context.template Alloc<T>(&filter_hwc);
+ dev_ctx.template Alloc<T>(&filter_hwc);
std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
- trans(context, filter, &filter_hwc, perm_axis);
+ trans(dev_ctx, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>();
}

@@ -1715,7 +1715,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
output_grad_data, \
filter_data, \
batch_size, \
@@ -1742,7 +1742,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
output_grad_data, \
filter_data, \
batch_size, \
@@ -1789,7 +1789,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
T,
fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& output_grad,
const std::vector<int>& strides,
@@ -1824,7 +1824,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,

const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>();
- T* filter_grad_data = context.template Alloc<T>(filter_grad);
+ T* filter_grad_data = dev_ctx.template Alloc<T>(filter_grad);

int block_size = 512;
int blocks;
@@ -1875,7 +1875,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(output_grad_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(output_grad_data, \
input_data, \
batch_size, \
output_channels, \
@@ -1902,9 +1902,9 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
filter_grad->dims()[0], \
filter_grad->dims()[1]}); \
filter_grad_hwc.Resize(filter_grad_hwc_dims); \
- context.template Alloc<T>(&filter_grad_hwc); \
+ dev_ctx.template Alloc<T>(&filter_grad_hwc); \
phi::funcs::SetConstant<phi::GPUContext, T> set_zero; \
- set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \
+ set_zero(dev_ctx, &filter_grad_hwc, static_cast<T>(0)); \
filter_grad_data = filter_grad_hwc.data<T>(); \
} else { \
block_size = 512; \
@@ -1924,7 +1924,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(output_grad_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(output_grad_data, \
input_data, \
batch_size, \
output_channels, \
@@ -1946,7 +1946,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
if (c_filter != -1) { \
std::vector<int> perm_axis({2, 3, 0, 1}); \
phi::funcs::TransposeNormal<phi::GPUContext, T> trans; \
- trans(context, filter_grad_hwc, filter_grad, perm_axis); \
+ trans(dev_ctx, filter_grad_hwc, filter_grad, perm_axis); \
} \
} \
return; \
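For orientation, the functors in this header implement depthwise convolution: each input channel is convolved with its own filter and there is no cross-channel accumulation. A naive NCHW reference loop follows, assuming a channel multiplier of 1, unit dilation and no padding; it is illustrative only and not the CUDA kernels above.

```cpp
// Hedged reference, not the Paddle GPU kernel: naive NCHW depthwise conv,
// one kh x kw filter per input channel.
void DepthwiseConvNaive(const float* in, const float* flt, float* out,
                        int n, int c, int h, int w,      // input dims
                        int kh, int kw, int sh, int sw,  // kernel / stride
                        int oh, int ow) {                // output spatial dims
  for (int b = 0; b < n; ++b)
    for (int ch = 0; ch < c; ++ch)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x) {
          float acc = 0.f;
          for (int ky = 0; ky < kh; ++ky)
            for (int kx = 0; kx < kw; ++kx) {
              int iy = y * sh + ky, ix = x * sw + kx;
              if (iy < h && ix < w)
                acc += in[((b * c + ch) * h + iy) * w + ix] *
                       flt[(ch * kh + ky) * kw + kx];
            }
          out[((b * c + ch) * oh + y) * ow + x] = acc;
        }
}
```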
8 changes: 4 additions & 4 deletions paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
@@ -23,7 +23,7 @@
namespace phi {

template <typename T, typename Context>
- void ExpandAsGradKernel(const Context& context,
+ void ExpandAsGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const std::vector<int64_t>& target_shape,
@@ -41,14 +41,14 @@ void ExpandAsGradKernel(const Context& context,
"to 6, but the value received is %d.",
out_rank));

- context.template Alloc<T>(in_grad);
+ dev_ctx.template Alloc<T>(in_grad);
if (in_dims == out_dims) {
- phi::Copy(context, out_grad, context.GetPlace(), false, in_grad);
+ phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, in_grad);
} else {
std::vector<int> reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1);

phi::SumKernel<T, Context>(
- context, out_grad, reduce_dims, out_grad.dtype(), false, in_grad);
+ dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, in_grad);
}
}

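The kernel above relies on expand_as being a broadcast, which is linear, so its backward pass either copies out_grad straight through (when the shapes already match) or sums it over the broadcast axes via SumKernel. A minimal sketch for a single broadcast axis; names are illustrative and this is not the GetReduceDim/SumKernel path itself.

```cpp
// Hedged sketch: x of shape [rows, 1] expanded to [rows, cols]; the gradient
// w.r.t. x reduces out_grad over the expanded axis.
void ExpandAsGradRowwise(const float* out_grad, float* in_grad,
                         int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) sum += out_grad[r * cols + c];
    in_grad[r] = sum;  // in_grad has shape [rows, 1]
  }
}
```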
12 changes: 6 additions & 6 deletions paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu
@@ -49,7 +49,7 @@ inline __global__ void sequence_expand_grad_kernel(const T* dout_data,

template <typename T>
struct SequenceExpandGradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& dout,
const phi::Vector<size_t>& x_lod, /*expand source lod*/
const phi::Vector<size_t>& ref_lod, /*expand based lod*/
@@ -67,14 +67,14 @@ struct SequenceExpandGradFunctor<phi::GPUContext, T> {
phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
phi::MixVector<size_t> mixv_x_lod(&x_lod);
phi::MixVector<size_t> mixv_out_offset(&out_offset);
- sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+ sequence_expand_grad_kernel<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
dout.data<T>(),
- mixv_ref_lod.CUDAData(context.GetPlace()),
- mixv_x_lod.CUDAData(context.GetPlace()),
- mixv_out_offset.CUDAData(context.GetPlace()),
+ mixv_ref_lod.CUDAData(dev_ctx.GetPlace()),
+ mixv_x_lod.CUDAData(dev_ctx.GetPlace()),
+ mixv_out_offset.CUDAData(dev_ctx.GetPlace()),
ref_lod.size(),
x_item_length,
- context.template Alloc<T>(dx));
+ dev_ctx.template Alloc<T>(dx));
}
};

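As a reading aid for the kernel above: every repeated copy of a source sequence contributes additively, so dx accumulates the matching rows of dout. A CPU sketch follows under the assumption that x_lod and ref_lod hold offsets and that sequence i of x is repeated once per element of reference sequence i; this is an illustration of the gradient rule, not the CUDA kernel.

```cpp
// Hedged sketch, assumptions as noted above; dx must be zero-initialized.
#include <cstddef>
#include <vector>

void SequenceExpandGradCPU(const float* dout, float* dx, int row_width,
                           const std::vector<std::size_t>& x_lod,
                           const std::vector<std::size_t>& ref_lod) {
  std::size_t dout_row = 0;
  for (std::size_t i = 0; i + 1 < x_lod.size(); ++i) {
    std::size_t x_start = x_lod[i];
    std::size_t x_len = x_lod[i + 1] - x_lod[i];
    std::size_t repeats = ref_lod[i + 1] - ref_lod[i];
    for (std::size_t r = 0; r < repeats; ++r)
      for (std::size_t row = 0; row < x_len; ++row, ++dout_row)
        for (int k = 0; k < row_width; ++k)
          dx[(x_start + row) * row_width + k] +=
              dout[dout_row * row_width + k];
  }
}
```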
18 changes: 9 additions & 9 deletions paddle/phi/kernels/gpu/sequence_expand_kernel.cu
@@ -18,7 +18,7 @@
namespace phi {

template <typename T>
- static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
+ static inline int ExpandByMemoryCopy(const phi::GPUContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
const phi::Vector<size_t>& x_lod,
@@ -27,7 +27,7 @@ static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
auto out_data = out->data<T>();
auto x_data = x.data<T>();

- const auto& gpu_place = context.GetPlace();
+ const auto& gpu_place = dev_ctx.GetPlace();

int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
@@ -51,7 +51,7 @@ static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
gpu_place,
x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length,
- context.stream());
+ dev_ctx.stream());
}
}
} else {
@@ -93,16 +93,16 @@ inline __global__ void sequence_expand_kernel(const T* x_data,

template <typename T>
struct SequenceExpandFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& x,
const phi::Vector<size_t>& x_lod, /*expand source lod*/
const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
DenseTensor* out) {
int num_copies =
- ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+ ExpandByMemoryCopy<T>(dev_ctx, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (num_copies < 5) {
- ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+ ExpandByMemoryCopy<T>(dev_ctx, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0];
size_t x_lod_size = x_lod.size();
@@ -118,7 +118,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {

phi::MixVector<size_t> mixv_out_offset(&out_offset);
const size_t* out_offset_data =
- mixv_out_offset.CUDAData(context.GetPlace());
+ mixv_out_offset.CUDAData(dev_ctx.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;

@@ -130,14 +130,14 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);

- sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+ sequence_expand_kernel<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
x.data<T>(),
x_lod_data,
ref_lod_data,
out_offset_data,
x_lod_size,
x_item_length,
- context.template Alloc<T>(out));
+ dev_ctx.template Alloc<T>(out));
}
}
};
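For background on the two code paths above (per-row device memory copies in ExpandByMemoryCopy versus the custom kernel when many copies are needed): sequence expand repeats each source sequence, delimited by x_lod, once per element of the corresponding reference sequence in ref_lod. A CPU sketch under that reading; lods are assumed to hold offsets and the names are illustrative, not the Paddle implementation.

```cpp
// Hedged CPU sketch, assumptions as noted above.
#include <cstddef>
#include <cstring>
#include <vector>

void SequenceExpandCPU(const float* x, float* out, int row_width,
                       const std::vector<std::size_t>& x_lod,
                       const std::vector<std::size_t>& ref_lod) {
  std::size_t out_row = 0;
  for (std::size_t i = 0; i + 1 < x_lod.size(); ++i) {
    std::size_t x_start = x_lod[i];
    std::size_t x_len = x_lod[i + 1] - x_lod[i];
    std::size_t repeats = ref_lod[i + 1] - ref_lod[i];
    for (std::size_t r = 0; r < repeats; ++r, out_row += x_len) {
      std::memcpy(out + out_row * row_width, x + x_start * row_width,
                  sizeof(float) * x_len * row_width);
    }
  }
}
```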
10 changes: 5 additions & 5 deletions paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu
@@ -71,7 +71,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,

template <typename T>
struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext &context,
+ void operator()(const phi::GPUContext &dev_ctx,
const DenseTensor &dout,
const DenseTensor &out,
const phi::Vector<size_t> &ref_lod, /*referenced lod*/
@@ -80,20 +80,20 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {

const int kThreadsPerBlock = 32;
int thread_x = kThreadsPerBlock;
- int max_threads = context.GetMaxPhysicalThreadCount();
+ int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

dim3 block_size(thread_x);
dim3 grid_size(max_blocks);

phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_softmax_grad_kernel<T, kThreadsPerBlock>
- <<<grid_size, block_size, 0, context.stream()>>>(
+ <<<grid_size, block_size, 0, dev_ctx.stream()>>>(
dout.data<T>(),
out.data<T>(),
- mixv_ref_lod.CUDAData(context.GetPlace()),
+ mixv_ref_lod.CUDAData(dev_ctx.GetPlace()),
height,
- context.Alloc<T>(dx));
+ dev_ctx.Alloc<T>(dx));
}
};

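For reference, the rule this kernel evaluates per LoD segment is the standard softmax backward pass, dx_i = out_i * (dout_i - sum_j dout_j * out_j), applied independently to each sequence. A scalar sketch for a single sequence; it illustrates the formula only, not the per-block CUDA reduction above.

```cpp
// Hedged sketch: softmax backward for one sequence of length n.
#include <cstddef>

void SoftmaxGrad1D(const float* dout, const float* out, float* dx,
                   std::size_t n) {
  float dot = 0.f;
  for (std::size_t j = 0; j < n; ++j) dot += dout[j] * out[j];
  for (std::size_t i = 0; i < n; ++i) dx[i] = out[i] * (dout[i] - dot);
}
```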