PaddlePaddle
diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h
Lines changed: 3 additions & 2 deletions
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
Lines changed: 2 additions & 2 deletions
diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu
Lines changed: 2 additions & 0 deletions
@@ -196,8 +196,9 @@ __global__ void GatherGradGPUKernel(const T* input,
  int64_t input_index_dim_size,
  int64_t out_index_dim_size,
  int64_t size) {
- int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
- for (; idx < size; idx += blockDim.x * gridDim.x) {
+ int64_t idx = static_cast<int64_t>(blockDim.x) * blockIdx.x + threadIdx.x;
+ const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
+ for (; idx < size; idx += stride) {
  int64_t inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
  int64_t next_idx = idx % (outer_dim_size * input_index_dim_size);
  int64_t index_dim_index = next_idx / (outer_dim_size);
 
@@ -243,14 +243,14 @@ static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout,
  int64_t size,
  T *dx,
  T *dy) {
- int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
+ int64_t col = static_cast<int64_t>(BLOCK_ID_X) * BLOCK_NUM_X + THREAD_ID_X;
 
  while (col < size) {
  if (dx != nullptr) {
  dx[col] = dout[col];
  }
  dy[col] = -dout[col];
- col += BLOCK_NUM_X * GRID_NUM_X;
+ col += static_cast<int64_t>(BLOCK_NUM_X) * GRID_NUM_X;
  }
 }
 
 
@@ -34,6 +34,7 @@ void GatherGradKernel(const Context& dev_ctx,
  if (axis_v < 0) {
  axis_v += static_cast<int>(x.dims().size());
  }
+
  if (axis_v != 0) {
  if (index_type == DataType::INT32) {
  phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(
@@ -44,6 +45,7 @@ void GatherGradKernel(const Context& dev_ctx,
  }
  return;
  }
+
  dev_ctx.template Alloc<T>(x_grad);
  auto dxt = EigenVector<T>::Flatten(*x_grad);
  auto& place = *dev_ctx.eigen_device();
Original file line number	Diff line number	Diff line change
`@@ -243,14 +243,14 @@ static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout,`
`243`	`243`	`int64_t size,`
`244`	`244`	`T *dx,`
`245`	`245`	`T *dy) {`
`246`		`- int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;`
	`246`	`+ int64_t col = static_cast<int64_t>(BLOCK_ID_X) * BLOCK_NUM_X + THREAD_ID_X;`
`247`	`247`
`248`	`248`	`while (col < size) {`
`249`	`249`	`if (dx != nullptr) {`
`250`	`250`	`dx[col] = dout[col];`
`251`	`251`	`}`
`252`	`252`	`dy[col] = -dout[col];`
`253`		`- col += BLOCK_NUM_X * GRID_NUM_X;`
	`253`	`+ col += static_cast<int64_t>(BLOCK_NUM_X) * GRID_NUM_X;`
`254`	`254`	`}`
`255`	`255`	`}`
`256`	`256`
Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,7 @@ void GatherGradKernel(const Context& dev_ctx,`
`34`	`34`	`if (axis_v < 0) {`
`35`	`35`	`axis_v += static_cast<int>(x.dims().size());`
`36`	`36`	`}`
	`37`	`+`
`37`	`38`	`if (axis_v != 0) {`
`38`	`39`	`if (index_type == DataType::INT32) {`
`39`	`40`	`phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(`
`@@ -44,6 +45,7 @@ void GatherGradKernel(const Context& dev_ctx,`
`44`	`45`	`}`
`45`	`46`	`return;`
`46`	`47`	`}`
	`48`	`+`
`47`	`49`	`dev_ctx.template Alloc<T>(x_grad);`
`48`	`50`	`auto dxt = EigenVector<T>::Flatten(*x_grad);`
`49`	`51`	`auto& place = *dev_ctx.eigen_device();`