PaddlePaddle · luotao1 · Jul 29, 2025 · Jul 25, 2025 · Jul 25, 2025
diff --git a/paddle/phi/kernels/funcs/cross_entropy.h b/paddle/phi/kernels/funcs/cross_entropy.h
@@ -76,7 +76,7 @@ struct TolerableValue<phi::dtype::bfloat16> {
 template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  phi::DenseTensor* out,
  const phi::DenseTensor* prob,
  const phi::DenseTensor* labels,

diff --git a/paddle/phi/kernels/funcs/fc_functor.h b/paddle/phi/kernels/funcs/fc_functor.h
@@ -25,7 +25,7 @@ namespace funcs {
 template <typename DeviceContext, typename T>
 class FCFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const int M,
  const int N,
  const int K,
@@ -40,7 +40,7 @@ class FCFunctor {
 template <typename DeviceContext, typename T>
 class FCInt8Functor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const int M,
  const int N,
  const int K,

diff --git a/paddle/phi/kernels/funcs/math/unpooling.h b/paddle/phi/kernels/funcs/math/unpooling.h
@@ -22,15 +22,15 @@ namespace math {
 template <typename DeviceContext, typename T>
 class Unpool2dMaxFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  const phi::DenseTensor& indices,
  phi::DenseTensor* output);
 };
 template <typename DeviceContext, class T>
 class Unpool2dMaxGradFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  const phi::DenseTensor& indices,
  const phi::DenseTensor& output,
@@ -41,15 +41,15 @@ class Unpool2dMaxGradFunctor {
 template <typename DeviceContext, typename T>
 class Unpool3dMaxFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  const phi::DenseTensor& indices,
  phi::DenseTensor* output);
 };
 template <typename DeviceContext, class T>
 class Unpool3dMaxGradFunctor {
  public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  const phi::DenseTensor& indices,
  const phi::DenseTensor& output,

diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
@@ -43,70 +43,70 @@ void BatchTranspose(T* output,
 template <typename DeviceContext, typename T>
 struct TransposeNormal {
  // for dims >= 7 situation
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& in,
  phi::DenseTensor* out,
  const std::vector<int>& axis);
 };
 
 template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& in,
  phi::DenseTensor* out,
  const std::vector<int>& axis);
 };
 
 template <typename DeviceContext, typename T>
 struct SetConstant {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  phi::DenseTensor* tensor,
  T num);
 };
 
 #ifdef PADDLE_WITH_XPU
 template <typename T>
 struct SetConstant<phi::XPUContext, T> {
- void operator()(const phi::XPUContext& context,
+ void operator()(const phi::XPUContext& dev_ctx,
  phi::DenseTensor* tensor,
  T num);
 };
 #endif
 
 template <typename Place>
-void set_constant_with_place(const phi::DeviceContext& context,
+void set_constant_with_place(const phi::DeviceContext& dev_ctx,
  phi::DenseTensor* tensor,
  float value);
 
-void set_constant(const phi::DeviceContext& context,
+void set_constant(const phi::DeviceContext& dev_ctx,
  phi::DenseTensor* tensor,
  float value);
 
 template <typename DeviceContext, typename T>
 struct RowwiseAdd {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  const phi::DenseTensor& vec,
  phi::DenseTensor* output);
 };
 
 template <typename DeviceContext, typename T>
 struct ColwiseSum {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* vec);
 };
 
 template <typename DeviceContext, typename T>
 struct RowwiseSum {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* vec);
 };
 
 template <typename DeviceContext, typename T>
 struct RowwiseMean {
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* vec);
 };

diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h
@@ -26,26 +26,26 @@ namespace funcs {
 using phi::To32BitIndex;
 
 template <typename DeviceContext, typename T>
-void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
+void SetConstant<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  phi::DenseTensor* tensor,
  T num) {
  auto t = phi::EigenVector<T>::Flatten(*tensor);
- t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
+ t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(num));
 }
 
 #ifdef PADDLE_WITH_XPU
 template <typename T>
-void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& context,
+void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& dev_ctx,
  phi::DenseTensor* tensor,
  T num) {
  phi::VisitDataType(tensor->dtype(),
- TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
+ TensorSetConstantXPU<T>(tensor, num, dev_ctx.GetPlace()));
 }
 #endif
 
 template <typename DeviceContext, typename T, int Rank>
 void Transpose<DeviceContext, T, Rank>::operator()(
- const DeviceContext& context,
+ const DeviceContext& dev_ctx,
  const phi::DenseTensor& in,
  phi::DenseTensor* out,
  const std::vector<int>& axis) {
@@ -55,10 +55,10 @@ void Transpose<DeviceContext, T, Rank>::operator()(
  }
  auto eigen_in = phi::EigenTensor<T, Rank>::From(in);
  auto eigen_out = phi::EigenTensor<T, Rank>::From(*out);
- auto* dev = context.eigen_device();
+ auto* dev = dev_ctx.eigen_device();
  // use 32bit index to speed up computation
  bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
- bool is_gpu_place = context.GetPlace().GetType() == phi::AllocationType::GPU;
+ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU;
  if (use_32bit_index && is_gpu_place) {
  To32BitIndex(eigen_out).device(*dev) =
  To32BitIndex(eigen_in).shuffle(permute);
@@ -68,7 +68,7 @@ void Transpose<DeviceContext, T, Rank>::operator()(
 }
 
 template <typename DeviceContext, typename T>
-void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto in_dims = input.dims();
@@ -85,7 +85,7 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
 
- vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
+ vec.device(*dev_ctx.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
 }
 
 // Specialize for CPU, since Eigen implement a general reduce. However,
@@ -94,7 +94,7 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename T>
 class ColwiseSum<phi::CPUContext, T> {
  public:
- void operator()(const phi::CPUContext& context,
+ void operator()(const phi::CPUContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto& in_dims = input.dims();
@@ -110,7 +110,7 @@ class ColwiseSum<phi::CPUContext, T> {
  size,
  out->numel()));
 
- T* out_buf = context.template Alloc<T>(out);
+ T* out_buf = dev_ctx.template Alloc<T>(out);
  const T* in_buf = input.data<T>();
 
  for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -126,7 +126,7 @@ class ColwiseSum<phi::CPUContext, T> {
 };
 
 template <typename DeviceContext, typename T>
-void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
+void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto in_dims = input.dims();
@@ -148,7 +148,7 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
 
- vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
+ vec.device(*dev_ctx.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
 }
 // TODO(zcd): Following ColwiseSum format, need to confirm.
 // Specialize for CPU, since Eigen implement a general reduce. However,
@@ -157,7 +157,7 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename T>
 class RowwiseMean<phi::CPUContext, T> {
  public:
- void operator()(const phi::CPUContext& context,
+ void operator()(const phi::CPUContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto& in_dims = input.dims();
@@ -179,7 +179,7 @@ class RowwiseMean<phi::CPUContext, T> {
  height,
  out->numel()));
  auto inv_size = 1.0 / size;
- T* out_buf = context.template Alloc<T>(out);
+ T* out_buf = dev_ctx.template Alloc<T>(out);
  const T* in_buf = input.data<T>();
 
  for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -193,7 +193,7 @@ class RowwiseMean<phi::CPUContext, T> {
 };
 
 template <typename DeviceContext, typename T>
-void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto in_dims = input.dims();
@@ -215,7 +215,7 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
  auto in = phi::EigenMatrix<T>::From(input);
  auto vec = phi::EigenVector<T>::Flatten(*out);
 
- vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
+ vec.device(*dev_ctx.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
 }
 // TODO(zcd): Following ColwiseSum format, need to confirm.
 // Specialize for CPU, since Eigen implement a general reduce. However,
@@ -224,7 +224,7 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename T>
 class RowwiseSum<phi::CPUContext, T> {
  public:
- void operator()(const phi::CPUContext& context,
+ void operator()(const phi::CPUContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* out) {
  auto& in_dims = input.dims();
@@ -246,7 +246,7 @@ class RowwiseSum<phi::CPUContext, T> {
  height,
  out->numel()));
 
- T* out_buf = context.template Alloc<T>(out);
+ T* out_buf = dev_ctx.template Alloc<T>(out);
  const T* in_buf = input.data<T>();
 
  for (size_t i = 0; i < static_cast<size_t>(height); ++i) {

diff --git a/paddle/phi/kernels/funcs/maxouting.cc b/paddle/phi/kernels/funcs/maxouting.cc
@@ -20,7 +20,7 @@ namespace phi::funcs {
 
 // All tensors are in NCHW or NHWC format, and the groups must be greater than 1
 template <typename DeviceContext, typename T>
-void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* output,
  const int groups,
@@ -35,7 +35,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
  // c_size means the output size of each sample
  int c_size = fea_size * output_channels;
  const T* input_data = input.data<T>();
- T* output_data = context.template Alloc<T>(output);
+ T* output_data = dev_ctx.template Alloc<T>(output);
  for (int i = 0; i < batch_size; ++i) {
  int new_bindex = c_size * i;
  for (int c = 0; c < output_channels; ++c) {
@@ -65,7 +65,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
 
 template <typename DeviceContext, typename T>
 void MaxOutGradFunctor<DeviceContext, T>::operator()(
- const DeviceContext& context,
+ const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* input_grad,
  const phi::DenseTensor& output,
@@ -82,7 +82,7 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
  const T* input_data = input.data<T>();
  const T* output_data = output.data<T>();
  const T* output_grad_data = output_grad.data<T>();
- T* input_grad_data = context.template Alloc<T>(input_grad);
+ T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
  for (int i = 0; i < batch_size; ++i) {
  int blen = fea_size * output_channels * i;
  for (int c = 0; c < output_channels; ++c) {

diff --git a/paddle/phi/kernels/funcs/maxouting.cu b/paddle/phi/kernels/funcs/maxouting.cu
@@ -111,7 +111,7 @@ __global__ void KernelMaxoutGrad(const int64_t nthreads,
 }
 
 template <typename DeviceContext, typename T>
-void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* output,
  const int groups,
@@ -122,13 +122,13 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
  const int64_t input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
 
  const T* input_data = input.data<T>();
- T* output_data = context.template Alloc<T>(output);
+ T* output_data = dev_ctx.template Alloc<T>(output);
  int64_t nthreads = static_cast<int64_t>(output->numel());
  int64_t blocks = static_cast<int64_t>((nthreads + 1024 - 1) / 1024);
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
 
- KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(nthreads,
+ KernelMaxOut<T><<<grid, threads, 0, dev_ctx.stream()>>>(nthreads,
  input_data,
  input_channels,
  input_height,
@@ -140,7 +140,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
 
 template <typename DeviceContext, typename T>
 void MaxOutGradFunctor<DeviceContext, T>::operator()(
- const DeviceContext& context,
+ const DeviceContext& dev_ctx,
  const phi::DenseTensor& input,
  phi::DenseTensor* input_grad,
  const phi::DenseTensor& output,
@@ -154,13 +154,13 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
  const T* input_data = input.data<T>();
  const T* output_data = output.data<T>();
  const T* output_grad_data = output_grad.data<T>();
- T* input_grad_data = context.template Alloc<T>(input_grad);
+ T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
  int64_t nthreads = static_cast<int64_t>(output.numel());
  int64_t blocks = static_cast<int64_t>((nthreads + 1024 - 1) / 1024);
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
 
- KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(nthreads,
+ KernelMaxoutGrad<T><<<grid, threads, 0, dev_ctx.stream()>>>(nthreads,
  input_data,
  output_data,
  output_grad_data,