PaddlePaddle
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
Lines changed: 55 additions & 14 deletions
@@ -128,18 +128,36 @@ struct Identity<T, ComplexSum> {
  static constexpr T value = {0, 0};
 };
 
+template <typename T, typename Op, bool UseKahan>
+struct BlockPrefixCallbackOp;
+
 template <typename T, typename Op>
-struct BlockPrefixCallbackOp {
+struct BlockPrefixCallbackOp<T, Op, false> {
  // Running prefix: starts at the identity and holds the op_-combination of every block_aggregate received so far.
  T running_total_;
- T compensation_;
  Op op_;
 
  __device__ BlockPrefixCallbackOp(T identity, Op op)
- : running_total_(identity), compensation_(identity), op_(op) {}
+ : running_total_(identity), op_(op) {}
 
  // Callback operator to be entered by the first warp of threads in the block; tid 0 is responsible for
  // returning the value that seeds the block-wide scan. Returns the running prefix as it was before this call, after folding block_aggregate into it via op_.
+ __device__ T operator()(T block_aggregate) {
+ const T old_prefix = running_total_;
+ running_total_ = op_(running_total_, block_aggregate);
+ return old_prefix;
+ }
+};
+
+template <typename T, typename Op>
+struct BlockPrefixCallbackOp<T, Op, true> {
+ T running_total_;
+ T compensation_;
+ Op op_;
+
+ __device__ BlockPrefixCallbackOp(T identity, Op op)
+ : running_total_(identity), compensation_(static_cast<T>(0.0)), op_(op) {}
+
  __device__ T operator()(T block_aggregate) {
  T old_prefix = running_total_;
 
@@ -155,20 +173,23 @@ struct BlockPrefixCallbackOp {
 };
 
 template <typename T>
-struct BlockPrefixCallbackOp<T, LogAddExp> {
+struct BlockPrefixCallbackOp<T, LogAddExp, true> {
  T max_so_far_;
  T scaled_sum_;
  T compensation_;
  LogAddExp op_;
 
  __device__ BlockPrefixCallbackOp(T identity, LogAddExp op)
- : max_so_far_(identity), scaled_sum_(0.0), compensation_(0.0), op_(op) {}
+ : max_so_far_(identity),
+ scaled_sum_(static_cast<T>(0.0)),
+ compensation_(static_cast<T>(0.0)),
+ op_(op) {}
 
  __device__ T operator()(T block_aggregate) {
  if (scaled_sum_ == 0.0) {
  max_so_far_ = block_aggregate;
- scaled_sum_ = 1.0;
- compensation_ = 0.0;
+ scaled_sum_ = static_cast<T>(1.0);
+ compensation_ = static_cast<T>(0.0);
  return std::numeric_limits<T>::lowest();
  }
 
@@ -195,15 +216,19 @@ struct BlockPrefixCallbackOp<T, LogAddExp> {
  }
 };
 
-template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD, typename Op>
+template <typename T,
+ int BLOCK_THREADS,
+ int ITEMS_PER_THREAD,
+ typename Op,
+ bool UseKahan>
 __global__ void BlockScanKernel(T* d_out,
  const T* d_in,
  int64_t grid_size,
  int64_t scan_size,
  bool exclusive,
  Op op) {
  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
- using CallbackOp = BlockPrefixCallbackOp<MT, Op>;
+ using CallbackOp = BlockPrefixCallbackOp<MT, Op, UseKahan>;
 
  // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
  using BlockLoadT = cub::
@@ -350,14 +375,30 @@ void ScanKernel(const Context& dev_ctx,
  }
  }
 
+ // When scan_size is large, switch to Kahan scan to get better precision
+ constexpr int64_t KAHAN_SWITCH_LENGTH = 1 << 16;
+
  // Do scan
  if (!transpose && !reverse) {
- BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
- out_data, in_data, grid_size, scan_size, exclusive, op);
-
+ if (scan_size > KAHAN_SWITCH_LENGTH) {
+ BlockScanKernel<T, 128, 4, Op, true>
+ <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+ out_data, in_data, grid_size, scan_size, exclusive, op);
+ } else {
+ BlockScanKernel<T, 128, 4, Op, false>
+ <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+ out_data, in_data, grid_size, scan_size, exclusive, op);
+ }
  } else {
- BlockScanKernel<T, 128, 4, Op><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
- next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+ if (scan_size > KAHAN_SWITCH_LENGTH) {
+ BlockScanKernel<T, 128, 4, Op, true>
+ <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+ next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+ } else {
+ BlockScanKernel<T, 128, 4, Op, false>
+ <<<scan_grid, 128, 0, dev_ctx.stream()>>>(
+ next_out_data, next_in_data, grid_size, scan_size, exclusive, op);
+ }
  }
  swap_ptr(next_in_data, next_out_data);