Skip to content

Commit 08a2d79

Browse files

[API] fix nn.functional.fused_layer_norm api large tensor bug (#74120)

Authored commit 08a2d79 (1 parent: 7b9507a)

File tree

4 files changed: +25 / -21 lines

paddle/phi/kernels/funcs/dropout_impl_util.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ inline bool GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
2626
const phi::DenseTensor* seed,
2727
const bool is_fix_seed,
2828
const int seed_val,
29-
const int offset,
29+
const uint64_t offset,
3030
uint64_t* seed_data,
3131
uint64_t* increment,
3232
bool use_copy = true) {

paddle/phi/kernels/fusion/gpu/fused_dropout_common.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,33 +56,33 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
5656
const uint64_t rows,
5757
const uint64_t cols,
5858
const int vec_size) {
59-
const uint64_t tmp_cols = cols / vec_size;
59+
const uint64_t tmp_cols = cols / static_cast<uint64_t>(vec_size);
6060
// NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias`
6161
// needs too many register resources. If data_type is float16, CUDA
6262
// error(701) will occur when block_size is 1024. Which error is
6363
// 'cudaErrorLaunchOutOfResources', this indicates that a launch did not
6464
// occur because it did not have appropriate resources.
6565
// Of course, this kernel can be optimized later to reduce the use
6666
// of registers.
67-
const int threads =
67+
const uint64_t threads =
6868
std::max(static_cast<uint64_t>(32),
6969
std::min(tmp_cols,
7070
static_cast<uint64_t>(
7171
std::min(dev_ctx.GetMaxThreadsPerBlock(), 512))));
72-
73-
const int blocks_x =
74-
std::max(static_cast<uint64_t>(1), (tmp_cols + threads - 1) / threads);
72+
const uint64_t blocks_x = std::min(
73+
static_cast<uint64_t>(65536),
74+
std::max(static_cast<uint64_t>(1), (tmp_cols + threads - 1) / threads));
7575
uint64_t blocks_y = std::max(static_cast<uint64_t>(1), rows);
7676
int blocks_z = 1;
77-
if (blocks_y > 65536) {
77+
if (blocks_y >= 65536) {
7878
blocks_z = 1024;
7979
blocks_y = (blocks_y + blocks_z - 1) / blocks_z;
80-
blocks_y = blocks_y > 65536 ? 65535 : blocks_y;
80+
blocks_y = blocks_y >= 65536 ? 65535 : blocks_y;
8181
}
8282
phi::backends::gpu::GpuLaunchConfig config;
83-
config.block_per_grid.x = blocks_x;
84-
config.block_per_grid.y = blocks_y;
85-
config.block_per_grid.z = blocks_z;
83+
config.block_per_grid.x = static_cast<uint32_t>(blocks_x);
84+
config.block_per_grid.y = static_cast<uint32_t>(blocks_y);
85+
config.block_per_grid.z = static_cast<uint32_t>(blocks_z);
8686
config.thread_per_block.x = threads;
8787
return config;
8888
}

paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ struct DropoutParam {
7272
seed_val = seed_val_;
7373
}
7474

75-
int UpdateSeedAndIncrement(const phi::GPUContext& dev_ctx, const int offset) {
75+
uint64_t UpdateSeedAndIncrement(const phi::GPUContext& dev_ctx,
76+
const uint64_t offset) {
7677
uint64_t tmp_increment;
7778
phi::funcs::GetSeedDataAndIncrement(dev_ctx,
7879
tensor_seed,
@@ -81,7 +82,7 @@ struct DropoutParam {
8182
offset,
8283
&seed,
8384
&tmp_increment);
84-
increment = static_cast<int>(tmp_increment);
85+
increment = tmp_increment;
8586
return increment;
8687
}
8788
};
@@ -104,26 +105,29 @@ template <typename T,
104105
typename OutType = T>
105106
class FusedDropoutHelper {
106107
private:
107-
int GetIncrement(const phi::GPUContext& dev_ctx) {
108+
uint64_t GetIncrement(const phi::GPUContext& dev_ctx) {
108109
const int VecSize = MAX_CACHE_BYTES / sizeof(T);
109110
const int real_vec_size = cols_ % VecSize == 0 ? VecSize : 1;
110111
auto config = Get1DBlocksAnd2DGrids(dev_ctx,
111112
static_cast<uint64_t>(rows_),
112113
static_cast<uint64_t>(cols_),
113114
real_vec_size);
114-
int increment = ((cols_ - 1) / (config.thread_per_block.x *
115-
config.block_per_grid.x * real_vec_size) +
116-
1) *
117-
real_vec_size;
115+
uint64_t increment =
116+
((cols_ - static_cast<uint64_t>(1)) /
117+
(static_cast<uint64_t>(config.thread_per_block.x) *
118+
static_cast<uint64_t>(config.block_per_grid.x) *
119+
static_cast<uint64_t>(real_vec_size)) +
120+
static_cast<uint64_t>(1)) *
121+
static_cast<uint64_t>(real_vec_size);
118122
increment = dropout_param_.UpdateSeedAndIncrement(dev_ctx, increment);
119123
return increment;
120124
}
121125

122126
public:
123127
FusedDropoutHelper() {}
124128
FusedDropoutHelper(const phi::GPUContext& dev_ctx,
125-
const int rows,
126-
const int cols,
129+
const int64_t rows,
130+
const int64_t cols,
127131
const DropoutParam& dropout_param,
128132
const float residual_alpha = 1.0) {
129133
rows_ = rows;

paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ template <typename T,
339339
typename OutType = T>
340340
void LaunchResidualDropoutBias(const uint64_t rows,
341341
const uint64_t cols,
342-
const int increment,
342+
const uint64_t increment,
343343
uint64_t seed,
344344
const float dropout_prob,
345345
const bool is_test,

Commit comments (0)