paddle/phi/kernels/funcs/reduce_grad_functions.h (3 additions, 3 deletions)
@@ -38,10 +38,10 @@ void ReduceGradFunctor(const Context& dev_ctx,
   auto x_dims = input0.dims();
   auto reduced_dims_v = common::vectorize(x_dims);
   std::vector<int> dims_ref = dims;
-  Eigen::array<int, D> broadcast_dim;
+  Eigen::array<int64_t, D> broadcast_dim;
   for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;

-  int broad_cast_times = 1;
+  int64_t broad_cast_times = 1;
   for (size_t i = 0; i < dims_ref.size(); ++i) {
     if (dims_ref[i] < 0) {
       dims_ref[i] = x_rank + dims_ref[i];
@@ -142,7 +142,7 @@ void LaunchReduceGradKernel(const Context& dev_ctx,
   auto& place = *dev_ctx.eigen_device();
   // *dev_ctx.eigen_device();
   auto broadcast_dim =
-      Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Eigen::array<int64_t, 1>({{static_cast<int64_t>(input0->numel())}});
   functor(place,
           &x,
           &x_reduce,
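The change above widens the Eigen broadcast descriptor and the accumulated broadcast factor from `int` to `int64_t`. `broad_cast_times` is the product of the sizes of the reduced dimensions, so for large tensors it can exceed `INT_MAX` and wrap. A minimal standalone sketch of that failure mode (the dimension values are hypothetical, not taken from the diff):

// Why the broadcast factor needs 64 bits: the product of reduced dimension
// sizes can exceed INT_MAX. Standalone illustration, not Paddle code.
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical reduced dimensions, e.g. reducing a [100000, 100000] tensor
  // over both axes.
  const int64_t reduced_dims[] = {100000, 100000};

  int32_t times32 = 1;
  int64_t times64 = 1;
  for (int64_t d : reduced_dims) {
    times32 *= static_cast<int32_t>(d);  // signed overflow: result is garbage
    times64 *= d;                        // 10000000000, still exact
  }

  std::cout << "32-bit product: " << times32 << "\n";  // wrapped, meaningless
  std::cout << "64-bit product: " << times64 << "\n";  // 10000000000
  return 0;
}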
paddle/phi/kernels/gpu/dist_kernel.cu (30 additions, 9 deletions)
@@ -63,6 +63,18 @@ struct PowFunctor {
   Ty p_order_;
 };

+template <typename Tx,
+          typename Ty,
+          typename Tout>  // Tx is high precision, Tout is low/out precision
+struct PowFunctorHighPrecision {
+  HOSTDEVICE explicit inline PowFunctorHighPrecision(const Ty& p_order)
+      : p_order_(p_order) {}
+  HOSTDEVICE inline Tx operator()(const Tx x) const {
+    return static_cast<Tout>(pow(static_cast<Ty>(x), p_order_));
+  }
+  Ty p_order_;
+};
+
 template <typename T, typename Functor>
 __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
@@ -126,16 +138,17 @@ void DistKernel(const Context& dev_ctx,
   DenseTensor intermediate;
   const T* x_ptr = x.data<T>();
   const T* y_ptr = y.data<T>();

   T* o_ptr = dev_ctx.template Alloc<T>(out);
   auto stream = dev_ctx.stream();

   auto xdim = x.dims();
   if (xdim == y.dims()) {  // same shape
-    auto n = x.numel();
+    int64_t n = x.numel();

     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
     intermediate.Resize(common::make_ddim({config.block_per_grid.x}));
     T* i_ptr = dev_ctx.template Alloc<T>(&intermediate);

     std::vector<int64_t> axis_dims = {static_cast<int64_t>(-1)};
     std::vector<int> reduce_axis =
         funcs::details::GetReduceDim(axis_dims, xdim.size(), true);
@@ -166,15 +179,23 @@
     ReduceSumWithSubtract<T>
         <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
             x_ptr, y_ptr, i_ptr, n, OtherOrderFunctor<T, MT>(p_order));
-    phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<MT>>(
-        dev_ctx, intermediate, out, kps::IdentityFunctor<MT>(), reduce_axis);
-
-    const DenseTensor* tmp_norm = out;
-    std::vector<const DenseTensor*> ins = {tmp_norm};
+    DenseTensor out_other;
+    out_other.Resize(out->dims());
+    dev_ctx.template Alloc<MT>(&out_other);
+
+    phi::funcs::
+        ReduceKernel<T, MT, kps::AddFunctor, kps::IdentityFunctor<MT>>(
+            dev_ctx,
+            intermediate,
+            &out_other,
+            kps::IdentityFunctor<MT>(),
+            reduce_axis);
+    std::vector<const DenseTensor*> ins = {&out_other};
     std::vector<DenseTensor*> outs = {out};
-    MT p_order_ = static_cast<MT>(static_cast<MT>(1.) / p_order);
+
+    MT p_order_ = static_cast<MT>(1.f / p_order);
     phi::funcs::ElementwiseKernel<T>(
-        dev_ctx, ins, &outs, PowFunctor<T, MT>(p_order_));
+        dev_ctx, ins, &outs, PowFunctorHighPrecision<MT, MT, T>(p_order_));
   }

 } else {
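On the same-shape path above, the kernel evaluates the p-distance below. The diff keeps the reduced sum in the compute type `MT` (the separate `out_other` tensor) and only drops to the output type `T` after the final `1/p` power, rather than rounding the partial sum through `out` first. A restatement of the computation with that split made explicit (the notation is mine, not from the source):

% p-distance on the same-shape path, for finite p > 0:
%   s is accumulated and stored in MT (out_other); the cast to T happens last.
\operatorname{dist}_p(x, y) = s^{1/p},
\qquad
s = \sum_i \lvert x_i - y_i \rvert^{\,p}.

`PowFunctorHighPrecision<MT, MT, T>` evaluates `pow(s, 1/p)` with `MT` arguments and casts the result to `T`, so the second stage matches the precision of the reduction instead of inheriting a value already rounded to `T`.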
paddle/phi/kernels/gpu/p_norm_grad_kernel.cu (29 additions, 6 deletions)
@@ -42,10 +42,12 @@ struct AbsMaxAndMinGradFunctor {

 template <typename T>
 struct PNormGradFunctor {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   HOSTDEVICE explicit inline PNormGradFunctor(float porder, float eps) {
-    this->porder = static_cast<T>(porder - 1.);
-    this->eps = static_cast<T>(eps);
+    this->porder = static_cast<MT>(porder - 1.);
+    this->eps = static_cast<MT>(eps);
   }
+
   template <typename Context,
             typename X,
             typename Y,
@@ -59,12 +61,33 @@ struct PNormGradFunctor {
                   DY* dy,
                   const Dim& dim,
                   int size) {
+    auto x_mt = x->template cast<MT>();
+    auto y_mt = y->template cast<MT>();
+    auto dy_mt = dy->template cast<MT>();
+
+    auto norm_pow = y_mt.pow(-this->porder);
+    auto mask_norm_nonzero = (y_mt != static_cast<MT>(0)).template cast<MT>();
+
+    // Set to 0 where porder < 0 and x == 0
+    MT zero = static_cast<MT>(0);
+    auto mask_x_zero = (x_mt == zero).template cast<MT>();
+
+    MT is_porder_negative =
+        this->porder < zero ? static_cast<MT>(1) : static_cast<MT>(0);
+    auto invalid_mask = (mask_x_zero * is_porder_negative);
+    auto safe_pow =
+        x_mt.abs().pow(this->porder) * (static_cast<MT>(1) - invalid_mask);
+
     dx->device(place) =
-        (*x).abs().pow(this->porder) * (*x).sign() * dy->broadcast(dim) *
-        (*y + y->constant(eps)).pow(-this->porder).broadcast(dim);
+        (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) *
+         norm_pow.broadcast(dim) *
+         mask_norm_nonzero.broadcast(dim)  // Mask out positions where norm == 0
+         )
+            .template cast<T>();
   }
-  T porder;
-  T eps;
+
+  MT porder;
+  MT eps;
 };

 template <typename T, typename Context>
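The rewritten functor evaluates the standard p-norm gradient in the compute type `MT` and adds two explicit guards that the old `eps`-based expression only approximated. Written out (my notation, read off the code above: `porder` in the code stores p - 1, `y` is the forward-pass norm, and `g` is the upstream gradient `dy`):

% Gradient of the p-norm as implemented by PNormGradFunctor:
\frac{\partial L}{\partial x_i}
  = g \cdot \operatorname{sign}(x_i)\,
    \frac{\lvert x_i \rvert^{\,p-1}}{y^{\,p-1}},
\qquad
\frac{\partial L}{\partial x_i} := 0
  \quad \text{if } y = 0
  \ \text{ or } \ \bigl(x_i = 0 \text{ and } p - 1 < 0\bigr).

The `mask_norm_nonzero` factor replaces the old `(*y + y->constant(eps))` regularization by zeroing the gradient wherever the norm itself is zero, and `invalid_mask` keeps `|x_i|^{p-1}` from producing `inf` at `x_i = 0` when `p - 1` is negative.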
paddle/phi/kernels/gpu/p_norm_kernel.cu (17 additions, 10 deletions)
@@ -124,31 +124,38 @@ void PNormKernel(const Context& dev_ctx,
     phi::funcs::ElementwiseKernel<T>(
         dev_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
 #else
+    DenseTensor out_temp;
+    out_temp.Resize(out_norm->dims());
+    dev_ctx.template Alloc<MT>(&out_temp);
+
     if (porder == 1.0) {
       // fast 1-norm
       phi::funcs::ReduceKernel<T, T, kps::AddFunctor, FabsFunctor<T>>(
           dev_ctx, *in_x, out_norm, FabsFunctor<T>(), reduce_axis);
     } else if (porder == 2.0) {
       // fast 2-norm
-      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, SquareFunctor<T>>(
-          dev_ctx, *in_x, out_norm, SquareFunctor<T>(), reduce_axis);
+      phi::funcs::ReduceKernel<T, MT, kps::AddFunctor, SquareFunctor<MT>>(
+          dev_ctx, *in_x, &out_temp, SquareFunctor<MT>(), reduce_axis);
     } else if (porder == 3.0) {
       // fast 3-norm
-      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, FabsCubicFunctor<T>>(
-          dev_ctx, *in_x, out_norm, FabsCubicFunctor<T>(), reduce_axis);
+      phi::funcs::ReduceKernel<T, MT, kps::AddFunctor, FabsCubicFunctor<MT>>(
+          dev_ctx, *in_x, &out_temp, FabsCubicFunctor<MT>(), reduce_axis);
     } else {
       // vanilla norm
-      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
-          dev_ctx, *in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis);
+      phi::funcs::ReduceKernel<T, MT, kps::AddFunctor, UnsignedPowFunctor<MT>>(
+          dev_ctx,
+          *in_x,
+          &out_temp,
+          UnsignedPowFunctor<MT>(porder),
+          reduce_axis);
     }

     if (porder != 1.0) {
       // save computation when porder is 1.0
-      const DenseTensor* tmp_norm = out_norm;
-      std::vector<const DenseTensor*> ins = {tmp_norm};
+      std::vector<const DenseTensor*> ins = {&out_temp};
       std::vector<DenseTensor*> outs = {out_norm};
+      MT p_order_ = static_cast<MT>(1.f / porder);
       phi::funcs::ElementwiseKernel<T>(
-          dev_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
+          dev_ctx, ins, &outs, UnsignedPowFunctor<MT>(p_order_));
     }
 #endif
   }
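As in the dist kernel, the forward p-norm now reduces into an `MT` temporary (`out_temp`) and applies the final `pow(sum, 1/p)` in `MT`, writing to `out_norm` only at the end. `MT` is `phi::dtype::MPTypeTrait<T>::Type`, which is `float` when `T` is `float16` or `bfloat16`; accumulating directly in `float16` both loses low-order bits and can saturate, since its largest finite value is 65504. A standalone sketch of the accumulation effect, with `float` standing in for the storage type and `double` for the compute type (the types and values are illustrative only):

// Low-precision accumulation drifts and eventually stops changing, which is
// why the reductions above accumulate in MT. Standalone sketch, not Paddle
// code; float/double stand in for float16/float.
#include <cstdio>

int main() {
  const int n = 1 << 24;   // ~16.7 million elements
  const float v = 0.1f;    // each element's contribution to the reduced sum

  float sum_low = 0.0f;    // accumulate in the storage type
  double sum_high = 0.0;   // accumulate in the compute type
  for (int i = 0; i < n; ++i) {
    sum_low += v;
    sum_high += static_cast<double>(v);
  }

  // Once the running sum is large enough, `sum_low += v` rounds away most or
  // all of each contribution, so the two results diverge noticeably.
  std::printf("storage-type sum: %.1f\n", sum_low);
  std::printf("compute-type sum: %.1f\n", sum_high);
  return 0;
}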