PaddlePaddle · wanghuancoder · Jul 2, 2025 · Jun 30, 2025
diff --git a/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu b/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu
@@ -35,7 +35,7 @@ void GetMaxLenTensor(const phi::GPUContext& dev_ctx,
  max_len_tensor.Resize({{1}});
  auto* max_len_tensor_data = dev_ctx.template Alloc<int>(
  &max_len_tensor, max_len_tensor.numel() * sizeof(int));
- const int bsz = batch_size.dims()[0];
+ const int64_t bsz = batch_size.dims()[0];
  constexpr int blockSize = 128;
  int max_len_cpu = 0;
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, dev_ctx.stream()>>>(

diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h
@@ -4342,14 +4342,14 @@ struct MaxOp {
 template <int THREADBLOCK_SIZE>
 __global__ void GetMaxLenKernel(const int *seq_lens,
  int *max_len,
- const int batch_size) {
- const int tid = threadIdx.x;
+ const int64_t batch_size) {
+ const int64_t tid = threadIdx.x;
 
  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
 
  int max_len_this_thread = 0;
- for (int i = tid; i < batch_size; i += blockDim.x) {
+ for (int64_t i = tid; i < batch_size; i += blockDim.x) {
  max_len_this_thread = max(seq_lens[i], max_len_this_thread);
  }
  int total =