32 changes: 16 additions & 16 deletions paddle/phi/kernels/funcs/batch_norm_utils.h
@@ -23,7 +23,7 @@ namespace phi {
using Tensor = DenseTensor;

template <typename DeviceContext, typename T>
- inline void ResizeToChannelFirst(const DeviceContext& context,
+ inline void ResizeToChannelFirst(const DeviceContext& dev_ctx,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
@@ -37,7 +37,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
@@ -47,20 +47,20 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());

auto in_dims_vec = common::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);
}
}

template <typename DeviceContext, typename T>
- inline void ResizeToChannelLast(const DeviceContext& context,
+ inline void ResizeToChannelLast(const DeviceContext& dev_ctx,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
@@ -74,7 +74,7 @@ inline void ResizeToChannelLast(const DeviceContext& context,
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);

} else if (dim == 2) {
// input
@@ -85,58 +85,58 @@ inline void ResizeToChannelLast(const DeviceContext& context,
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());

auto in_dims_vec = common::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(common::make_ddim(in_dims_vec));
- context.template Alloc<T>(transformed_input);
+ dev_ctx.template Alloc<T>(transformed_input);
}
}

template <typename DeviceContext, typename T>
- inline void TransToChannelFirst(const DeviceContext& context,
+ inline void TransToChannelFirst(const DeviceContext& dev_ctx,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
- trans5(context, *input, transformed_input, axis);
+ trans5(dev_ctx, *input, transformed_input, axis);

} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
- trans4(context, *input, transformed_input, axis);
+ trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
- trans3(context, *input, transformed_input, axis);
+ trans3(dev_ctx, *input, transformed_input, axis);
}
}

template <typename DeviceContext, typename T>
- inline void TransToChannelLast(const DeviceContext& context,
+ inline void TransToChannelLast(const DeviceContext& dev_ctx,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
- trans5(context, *input, transformed_input, axis);
+ trans5(dev_ctx, *input, transformed_input, axis);

} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
- trans4(context, *input, transformed_input, axis);
+ trans4(dev_ctx, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
- trans3(context, *input, transformed_input, axis);
+ trans3(dev_ctx, *input, transformed_input, axis);
}
}

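For readers skimming the rename, the following is a hypothetical caller sketch, not code from this PR: it shows how the renamed dev_ctx argument is threaded through the two channel-layout helpers above. It assumes phi::CPUContext as the concrete device context, float data, and a 4-D NHWC input; the include paths follow the file names shown in this diff.

```cpp
// Hypothetical usage sketch (assumptions: phi::CPUContext as the device
// context, float data, 4-D NHWC input). Not part of this PR.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"

void ToChannelFirst(const phi::CPUContext& dev_ctx,
                    const phi::DenseTensor& nhwc_input,
                    phi::DenseTensor* nchw_output) {
  // Resize and allocate the destination with the channel-first (NCHW) shape...
  phi::ResizeToChannelFirst<phi::CPUContext, float>(
      dev_ctx, &nhwc_input, nchw_output);
  // ...then physically transpose NHWC -> NCHW into the allocated buffer.
  phi::TransToChannelFirst<phi::CPUContext, float>(
      dev_ctx, &nhwc_input, nchw_output);
}
```
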
24 changes: 12 additions & 12 deletions paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
@@ -285,10 +285,10 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
}

template <typename T, typename Context>
- inline void forward_reset_outputV2(const Context &context,
+ inline void forward_reset_outputV2(const Context &dev_ctx,
phi::funcs::GRUMetaValue<T> value,
int frame_size) {
- auto &place = *context.eigen_device();
+ auto &place = *dev_ctx.eigen_device();
auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
auto value_update_gate = typename EigenVector<T>::Type(
@@ -310,11 +310,11 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
int batch_size,
ActivationType active_gate,
bool old_version = true,
- const Context *context = nullptr) {
+ const Context *dev_ctx = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (!old_version) {
// use eigen
- forward_reset_outputV2(*context, value, frame_size);
+ forward_reset_outputV2(*dev_ctx, value, frame_size);
} else {
if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
@@ -346,10 +346,10 @@
}

template <typename T, typename Context>
- inline void forward_final_outputV2(const Context &context,
+ inline void forward_final_outputV2(const Context &dev_ctx,
phi::funcs::GRUMetaValue<T> value,
int frame_size) {
- auto &place = *context.eigen_device();
+ auto &place = *dev_ctx.eigen_device();
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto value_frame_state = typename EigenVector<T>::Type(
@@ -375,11 +375,11 @@ inline void forward_final_output(OpFinalOutput op_final_output,
ActivationType active_node,
bool origin_mode,
bool old_version = true,
- const Context *context = nullptr) {
+ const Context *dev_ctx = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (!old_version) {
// eigen
- forward_final_outputV2(*context, value, frame_size);
+ forward_final_outputV2(*dev_ctx, value, frame_size);
} else {
if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
@@ -866,11 +866,11 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad,
}

template <typename T, typename Context>
- inline void gru_backward(const Context &context,
+ inline void gru_backward(const Context &dev_ctx,
phi::funcs::GRUMetaValue<T> value,
phi::funcs::GRUMetaGrad<T> grad,
int frame_size) {
- auto &place = *context.eigen_device();
+ auto &place = *dev_ctx.eigen_device();

auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
@@ -931,7 +931,7 @@ inline void gru_backward(const Context &context,
}

template <class OpGruGrad, typename T, typename Context>
- inline void cpu_gru_backward(const Context &context,
+ inline void cpu_gru_backward(const Context &dev_ctx,
OpGruGrad op_gru_grad UNUSED,
phi::funcs::GRUMetaValue<T> value,
phi::funcs::GRUMetaGrad<T> grad,
@@ -941,7 +941,7 @@ inline void cpu_gru_backward(const Context &context,
ActivationType active_gate UNUSED) {
for (int b = 0; b < batch_size; ++b) {
// eigen
- gru_backward(context, value, grad, frame_size);
+ gru_backward(dev_ctx, value, grad, frame_size);

value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
16 changes: 8 additions & 8 deletions paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
@@ -411,7 +411,7 @@ void avx_lstm_backward_one_sequence(Op op,
}

template <class T, class Context>
- void eigen_lstm_forward_one_sequence(const Context &context,
+ void eigen_lstm_forward_one_sequence(const Context &dev_ctx,
phi::funcs::LstmMetaValue<T> value,
int frame_size) {
auto eigen_value_ig =
@@ -429,7 +429,7 @@ void eigen_lstm_forward_one_sequence(const Context &context,
auto eigen_output =
typename EigenVector<T>::Type(value.output_value, Array1(frame_size));

- auto &place = *context.eigen_device();
+ auto &place = *dev_ctx.eigen_device();
TanhFunctor<T>()(place, eigen_value_in, eigen_value_in);
SigmoidFunctor<T>()(place, eigen_value_ig, eigen_value_ig);
SigmoidFunctor<T>()(place, eigen_value_fg, eigen_value_fg);
@@ -447,7 +447,7 @@ void eigen_lstm_forward_one_sequence(const Context &context,
}

template <class T, class Context>
- void eigen_lstm_backward_one_sequence(const Context &context,
+ void eigen_lstm_backward_one_sequence(const Context &dev_ctx,
phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad,
int frame_size) {
@@ -475,7 +475,7 @@ void eigen_lstm_backward_one_sequence(const Context &context,
auto eigen_grad_state =
typename EigenVector<T>::Type(grad.state_grad, Array1(frame_size));

- auto &place = *context.eigen_device();
+ auto &place = *dev_ctx.eigen_device();
SigmoidGradFunctor<T>()(place,
1 /*useless*/,
eigen_value_og,
@@ -514,7 +514,7 @@ void eigen_lstm_backward_one_sequence(const Context &context,
}

template <class T, class Op, class Context>
- void cpu_lstm_forward(const Context &context,
+ void cpu_lstm_forward(const Context &dev_ctx,
Op op,
phi::funcs::LstmMetaValue<T> value,
int frame_size,
@@ -524,7 +524,7 @@ void cpu_lstm_forward(const Context &context,
ActivationType active_state,
bool old_api_version) {
if (!old_api_version) {
- eigen_lstm_forward_one_sequence<T>(context, value, frame_size);
+ eigen_lstm_forward_one_sequence<T>(dev_ctx, value, frame_size);
} else {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op,
@@ -549,7 +549,7 @@ void cpu_lstm_forward(const Context &context,
}

template <class T, class Op, class Context>
- void cpu_lstm_backward(const Context &context,
+ void cpu_lstm_backward(const Context &dev_ctx,
Op op,
phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad,
@@ -560,7 +560,7 @@ void cpu_lstm_backward(const Context &context,
ActivationType active_state,
bool old_api_version) {
if (!old_api_version) {
- eigen_lstm_backward_one_sequence<T>(context, value, grad, frame_size);
+ eigen_lstm_backward_one_sequence<T>(dev_ctx, value, grad, frame_size);
} else {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op,
8 changes: 4 additions & 4 deletions paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -218,7 +218,7 @@ __global__ void KeLstmBackward(Op op,
}

template <class T, class Op>
- void gpu_lstm_forward(const phi::DeviceContext& context,
+ void gpu_lstm_forward(const phi::DeviceContext& dev_ctx,
Op op,
phi::funcs::LstmMetaValue<T> value,
int frame_size,
@@ -240,7 +240,7 @@ void gpu_lstm_forward(const phi::DeviceContext& context,
grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
}

- auto stream = reinterpret_cast<const phi::GPUContext&>(context).stream();
+ auto stream = reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (batch_size == 1) {
KeLstmForward<T,
Op,
@@ -269,7 +269,7 @@ void gpu_lstm_forward(const phi::DeviceContext& context,
}

template <class T, class Op>
- void gpu_lstm_backward(const phi::DeviceContext& context,
+ void gpu_lstm_backward(const phi::DeviceContext& dev_ctx,
Op op,
phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad,
@@ -292,7 +292,7 @@ void gpu_lstm_backward(const phi::DeviceContext& context,
grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
}

- auto stream = reinterpret_cast<const phi::GPUContext&>(context).stream();
+ auto stream = reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (batch_size == 1) {
KeLstmBackward<T,
Op,
8 changes: 4 additions & 4 deletions paddle/phi/kernels/funcs/diagonal.h
@@ -63,7 +63,7 @@ struct DiagonalFunctor {
};

template <typename T, typename DeviceContext>
- DenseTensor Diagonal(const DeviceContext& context,
+ DenseTensor Diagonal(const DeviceContext& dev_ctx,
const DenseTensor* input,
int64_t offset,
int64_t dim1,
@@ -105,7 +105,7 @@ DenseTensor Diagonal(const DeviceContext& context,
DDim diag_dims = common::make_ddim(ret_dims);
auto dig_stride = common::stride(diag_dims);
diag.Resize(diag_dims);
- auto diag_data = context.template Alloc<T>(&diag);
+ auto diag_data = dev_ctx.template Alloc<T>(&diag);

int64_t pos = std::abs(offset) * offset_stride;
int64_t dim_size = ret_strides.size();
@@ -119,8 +119,8 @@ DenseTensor Diagonal(const DeviceContext& context,
const auto* ret_arr = ret_strides.data();
#endif

- // auto& dev_ctx = context.template device_context<DeviceContext>();
- phi::funcs::ForRange<DeviceContext> for_range(context, diag.numel());
+ // auto& dev_ctx2 = dev_ctx.template device_context<DeviceContext>();
+ phi::funcs::ForRange<DeviceContext> for_range(dev_ctx, diag.numel());
DiagonalFunctor<T> functor(
input_data, diag_arr, ret_arr, pos, dim_size, diag_data);
for_range(functor);
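As a small illustration of the renamed parameter at a call site, here is a hypothetical caller sketch, not part of this PR. Two assumptions are labeled explicitly: that Diagonal is declared in phi::funcs (as the header path suggests), and that the trailing parameter truncated in the diff above is a second dimension index.

```cpp
// Hypothetical caller sketch. Assumptions: Diagonal lives in phi::funcs, and
// its final parameter (cut off in the diff above) is a second axis index.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/diagonal.h"

phi::DenseTensor MainDiagonal(const phi::CPUContext& dev_ctx,
                              const phi::DenseTensor& matrix) {
  // offset 0 selects the main diagonal; axes 0 and 1 are the paired dimensions.
  return phi::funcs::Diagonal<float, phi::CPUContext>(
      dev_ctx, &matrix, /*offset=*/0, /*dim1=*/0, /*dim2=*/1);
}
```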