PaddlePaddle · wanghuancoder · Jul 30, 2025 · Jul 29, 2025
diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h
@@ -78,7 +78,7 @@ void BaddbmmGradKernel(const Context& dev_ctx,
  in_dims = {input.dims()[0], 1, input.dims()[1]};
  input_grad->Resize(in_dims);
  }
- int total_elems = 0;
+ int64_t total_elems = 0;
 
  VLOG(3) << "alpha: " << alpha << " beta: " << beta;
 
@@ -212,7 +212,7 @@ void BaddbmmGradKernel(const Context& dev_ctx,
  total_elems = x.dims()[0] * x.dims()[1] * x.dims()[2];
  // x_grad = out_grad * y'. x_grad: B x M x K, out_grad : B x M x N, y : B x
  // K x N
- for (int i = 0; i < x.dims()[0]; ++i) {
+ for (int64_t i = 0; i < x.dims()[0]; ++i) {
  auto out_grad_slice = out_grad.Slice(i, i + 1);
  auto y_slice = y.Slice(i, i + 1);
  auto x_grad_slice = x_grad->Slice(i, i + 1);
@@ -238,7 +238,7 @@ void BaddbmmGradKernel(const Context& dev_ctx,
  total_elems = y.dims()[0] * y.dims()[1] * y.dims()[2];
  // y_grad = x' * out_grad. y_grad: B x K x N, out_grad : B x M x N, x : B x
  // M x K
- for (int i = 0; i < x.dims()[0]; ++i) {
+ for (int64_t i = 0; i < x.dims()[0]; ++i) {
  auto out_grad_slice = out_grad.Slice(i, i + 1);
  auto x_slice = x.Slice(i, i + 1);
  auto y_grad_slice = y_grad->Slice(i, i + 1);

diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h
@@ -520,8 +520,8 @@ void MatMulFunctionImplWithCublasLt(
  using blaslt = phi::funcs::MatmulWithCublasLt<T>;
 
  if (x_ndim == 1 && y_ndim == 1) {
- const int M = X.numel();
- const int N = Y.numel();
+ const int64_t M = X.numel();
+ const int64_t N = Y.numel();
  PADDLE_ENFORCE_EQ(
  M,
  N,
@@ -550,7 +550,7 @@ void MatMulFunctionImplWithCublasLt(
  }
 
  if (x_ndim == 1) {
- const int N = X.numel();
+ const int64_t N = X.numel();
  if (trans_y) {
  PADDLE_ENFORCE_EQ(
  y_dims[y_ndim - 1],
@@ -633,7 +633,7 @@ void MatMulFunctionImplWithCublasLt(
  }
 
  if (y_ndim == 1) {
- const int N = Y.numel();
+ const int64_t N = Y.numel();
  if (trans_x) {
  PADDLE_ENFORCE_EQ(
  x_dims[x_ndim - 2],
@@ -1008,8 +1008,8 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  return false;
  }
 #if CUDA_VERSION >= 11060
- const int x_ndim = x_dims.size();
- const int y_ndim = y_dims.size();
+ const int64_t x_ndim = x_dims.size();
+ const int64_t y_ndim = y_dims.size();
  const int8_t* x_data = x.data<int8_t>();
  const int8_t* y_data = y.data<int8_t>();
  using blaslt = phi::funcs::MatmulWithCublasLt<int8_t, int32_t>;
@@ -1027,8 +1027,8 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  /* no_exchange */ true);
 
  if (x_ndim == 1 && y_ndim == 1) {
- const int M = x.numel();
- const int N = y.numel();
+ const int64_t M = x.numel();
+ const int64_t N = y.numel();
  PADDLE_ENFORCE_EQ(
  M,
  N,
@@ -1057,7 +1057,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  return true;
  }
  if (x_ndim == 1) {
- const int N = x.numel();
+ const int64_t N = x.numel();
  if (trans_y) {
  PADDLE_ENFORCE_EQ(
  y_dims[y_ndim - 1],
@@ -1083,7 +1083,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  N,
  y_ndim - 2,
  y_dims[y_ndim - 2]));
- const int M = y.numel() / N;
+ const int64_t M = y.numel() / N;
  if (!(M == 1 || M % 4 == 0)) {
  return false;
  }
@@ -1098,7 +1098,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  out->ResizeAndAllocate(common::make_ddim(out_dims));
  dev_ctx.template Alloc<int32_t>(out);
  if (trans_y) {
- const int M = y.numel() / N;
+ const int64_t M = y.numel() / N;
  blaslt::Run(dev_ctx,
  y_data,
  x_data,
@@ -1110,7 +1110,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  false,
  &matmul_planner);
  } else {
- const int M = y_dims[y_ndim - 1];
+ const int64_t M = y_dims[y_ndim - 1];
  const int batch_size = y.numel() / (M * N);
  if (batch_size == 1) {
  blaslt::Run(dev_ctx,
@@ -1144,7 +1144,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  }
 
  if (y_ndim == 1) {
- const int N = y.numel();
+ const int64_t N = y.numel();
  if (trans_x) {
  PADDLE_ENFORCE_EQ(
  x_dims[x_ndim - 2],
@@ -1156,7 +1156,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  N,
  x_ndim - 2,
  x_dims[x_ndim - 2]));
- const int M = x.numel() / N;
+ const int64_t M = x.numel() / N;
  if (!((M == 1 || M % 4 == 0))) {
  return false;
  }
@@ -1459,8 +1459,8 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  const int8_t* y_data = y.data<int8_t>();
 
  if (x_ndim == 1 && y_ndim == 1) {
- const int M = x.numel();
- const int N = y.numel();
+ const int64_t M = x.numel();
+ const int64_t N = y.numel();
  PADDLE_ENFORCE_EQ(
  M,
  N,
@@ -1488,7 +1488,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  }
 
  if (x_ndim == 1) {
- const int N = x.numel();
+ const int64_t N = x.numel();
  if (trans_y) {
  PADDLE_ENFORCE_EQ(
  y_dims[y_ndim - 1],
@@ -1569,7 +1569,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& dev_ctx,
  }
 
  if (y_ndim == 1) {
- const int N = y.numel();
+ const int64_t N = y.numel();
  if (trans_x) {
  PADDLE_ENFORCE_EQ(
  x_dims[x_ndim - 2],

diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
@@ -111,7 +111,7 @@ inline DenseTensor MatChainMul(const Context& dev_ctx,
 template <typename Context, typename T>
 std::vector<uint64_t> GetOrder(const std::vector<const DenseTensor*>& ins,
  const std::vector<phi::DDim>& ins_dims) {
- auto n = ins.size();
+ uint64_t n = ins.size();
  // p: save the ins shape, the ins[i] shape is (p[i], p[i+1])
  std::vector<uint64_t> p(n + 1);
  for (uint64_t i = 0; i < n; i++) {
@@ -329,7 +329,7 @@ void MultiDotGradMatChainOrder(const Context& dev_ctx,
  std::vector<DenseTensor*>* dx) {
  auto order = GetOrder<Context, T>(ins, ins_dims);
  auto n = ins.size();
- std::vector<DenseTensor> results(n * n);
+ std::vector<DenseTensor> results(static_cast<int64_t>(n) * n);
  MatChainMul<Context, T>(
  dev_ctx, ins, ins_dims, order, 0, n - 1, true, &results);
  MatChainMulGrad<Context, T>(