PaddlePaddle · lshpku · Jun 26, 2025 · Jun 16, 2025 · Jun 22, 2025 · Jun 23, 2025
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -114,8 +114,9 @@ struct LogAddExp {
  template <typename T>
  __host__ __device__ __forceinline__ T operator()(const T& a,
  const T& b) const {
- return std::log(1 + std::exp(std::min(a, b) - std::max(a, b))) +
- std::max(a, b);
+ T min_val = std::min(a, b);
+ T max_val = std::max(a, b);
+ return std::log1p(std::exp(min_val - max_val)) + max_val;
  }
 };
 
@@ -325,7 +326,7 @@ void ScanKernel(const Context& dev_ctx,
  for (size_t i = axis + 1; i < out_dims.size(); i++) {
  width *= out_dims[i];
  }
- int scan_size = out_dims[axis];
+ int64_t scan_size = out_dims[axis];
  bool transpose = (axis != out_dims.size() - 1);
 
  DenseTensor tmp_tensor;
@@ -341,7 +342,7 @@ void ScanKernel(const Context& dev_ctx,
  int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0];
 
  // Do pre-process transpose
- int tile_size = 32;
+ int64_t tile_size = 32;
  dim3 blocks(32, 8);
  int64_t transpose_grids = ((width + tile_size - 1) / tile_size) *
  ((height + tile_size - 1) / tile_size);