14 changes: 14 additions & 0 deletions paddle/phi/infermeta/fusion.cc
@@ -4508,6 +4508,20 @@ void VariableLengthMemoryEfficientAttentionInferMeta(
true,
common::errors::InvalidArgument(
"The seq length of Key, Value should be equal."));
if (mask) {
PADDLE_ENFORCE_EQ(
mask.dims().size(),
4,
common::errors::InvalidArgument("Mask should be a 4-D tensor"
"But received Value dimension(%s)",
mask.dims().size()));
const int64_t mask_batch_size = mask.dims()[0];
PADDLE_ENFORCE_EQ(
query_batch_size == mask_batch_size,
true,
common::errors::InvalidArgument(
"The batch size of Query, Key, Value and Mask should be equal."));
}

std::vector<int64_t> out_dims(
{query_batch_size, query_num_head, query_seq_length, value_head_size});
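For reference, a minimal Python sketch of the contract the new checks enforce (shapes illustrative, not tied to any real API call): the optional mask must be 4-D and share its batch dimension with the query.

# Pure-Python mirror of the new InferMeta checks; names and shapes are hypothetical.
query_shape = [2, 8, 128, 64]   # [batch, num_head, seq_len, head_dim]
good_mask   = [2, 1, 128, 128]  # 4-D, batch matches the query -> accepted
bad_rank    = [2, 128, 128]     # 3-D -> "Mask should be a 4-D tensor"
bad_batch   = [4, 1, 128, 128]  # batch mismatch -> batch-size error

def check_mask(query_shape, mask_shape):
    assert len(mask_shape) == 4, "Mask should be a 4-D tensor"
    assert mask_shape[0] == query_shape[0], (
        "The batch size of Query, Key, Value and Mask should be equal.")

check_mask(query_shape, good_mask)  # passes; the other two would raise AssertionError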
@@ -67,7 +67,8 @@ void MultiHeadAttentionVariableForwardKernel(
params.causal = causal;
params.pre_cache_length = pre_cache_length;

if (mask) {
// if the mask is a 0-size tensor, we don't need to set mask_ptr
if (mask && mask.get().numel() > 0) {
// [B, 1, S, D]
auto mask_tensor = mask.get();
int64_t mask_num_heads = mask_tensor.dims()[1];
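The guard above treats a zero-size mask the same as no mask at all. A small Python mirror of that logic (function name hypothetical):

import numpy as np

def resolve_mask(mask):
    """None or a zero-size array both mean 'run without a mask'."""
    if mask is not None and mask.size > 0:
        return mask
    return None

assert resolve_mask(None) is None
assert resolve_mask(np.empty((2, 1, 0, 128))) is None  # 0-size mask is skipped
assert resolve_mask(np.ones((2, 1, 4, 4))) is not None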
11 changes: 5 additions & 6 deletions paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu
@@ -38,8 +38,10 @@ void FusedRopeGradKernel(const Context& dev_ctx,
DenseTensor* dk,
DenseTensor* dv) {
int64_t numel = dout_q.numel();
if (numel <= 0) return;
dev_ctx.template Alloc<T>(dq);
if (dout_k) dev_ctx.template Alloc<T>(dk);
if (dout_v) dev_ctx.template Alloc<T>(dv);
if (numel <= 0) return;

phi::Array<int64_t, 3> inputs_num_heads;
// small size for broadcast
@@ -70,22 +72,19 @@ void FusedRopeGradKernel(const Context& dev_ctx,
outs_data[0] = dq->data<T>();
int num_inputs = 1;

if (dout_k) {
dev_ctx.template Alloc<T>(dk);
if (dk && dk->numel() > 0) {
outs_data[num_inputs] = dk->data<T>();
ins_data[num_inputs] = dout_k->data<T>();
inputs_num_heads[num_inputs] = dk->dims()[2];
num_inputs++;
}

if (dout_v) {
dev_ctx.template Alloc<T>(dv);
if (dv && dv->numel() > 0) {
outs_data[num_inputs] = dv->data<T>();
ins_data[num_inputs] = dout_v->data<T>();
inputs_num_heads[num_inputs] = dv->dims()[2];
num_inputs++;
}

using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType div_c = static_cast<MPType>(1.0f / head_dim);

11 changes: 5 additions & 6 deletions paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
@@ -38,8 +38,10 @@ void FusedRopeKernel(const Context& dev_ctx,
DenseTensor* out_k,
DenseTensor* out_v) {
int64_t numel = q.numel();
if (numel <= 0) return;
dev_ctx.template Alloc<T>(out_q);
if (k) dev_ctx.template Alloc<T>(out_k);
if (v) dev_ctx.template Alloc<T>(out_v);
if (numel <= 0) return;

phi::Array<int64_t, 3> inputs_num_heads;

@@ -73,16 +75,13 @@ void FusedRopeKernel(const Context& dev_ctx,
outs_data[0] = out_q->data<T>();
int num_inputs = 1;

if (k) {
dev_ctx.template Alloc<T>(out_k);
if (out_k && out_k->numel() > 0) {
ins_data[num_inputs] = k->data<T>();
outs_data[num_inputs] = out_k->data<T>();
inputs_num_heads[num_inputs] = k->dims()[2];
num_inputs++;
}

if (v) {
dev_ctx.template Alloc<T>(out_v);
if (out_v && out_v->numel() > 0) {
ins_data[num_inputs] = v->data<T>();
outs_data[num_inputs] = out_v->data<T>();
inputs_num_heads[num_inputs] = v->dims()[2];
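Both RoPE kernels now allocate their outputs before the zero-size early return and skip zero-size dk/dv, so a zero-size batch round-trips cleanly. A minimal usage sketch, modeled on the new test below (the import path is an assumption and a CUDA/ROCm build is required):

import paddle
from paddle.incubate.nn.functional import fused_rotary_position_embedding

q = paddle.randn([0, 1, 8, 8], dtype="float32")  # zero-size batch
k = paddle.randn([0, 1, 8, 8], dtype="float32")
v = paddle.randn([0, 1, 8, 8], dtype="float32")
q.stop_gradient = k.stop_gradient = v.stop_gradient = False
sin = paddle.sin(paddle.randn([1, 1, 1, 8], dtype="float32"))
cos = paddle.cos(paddle.randn([1, 1, 1, 8], dtype="float32"))

out_q, out_k, out_v = fused_rotary_position_embedding(
    q, k, v, sin=sin, cos=cos, use_neox_rotary_style=False)
(out_q + out_k + out_v).backward()
assert q.grad.shape == q.shape  # gradients keep the zero-size shapes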
12 changes: 12 additions & 0 deletions paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -927,6 +927,18 @@ void FlashAttnGradKernel(const Context& dev_ctx,
if (dv) {
dev_ctx.template Alloc<T>(dv);
}
if (dout.numel() == 0) {
if (dq)
Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(dq->dims())), 0, dq);
if (dk)
Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(dk->dims())), 0, dk);
if (dv)
Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(dv->dims())), 0, dv);
return;
}
FlashAttnGradBaseKernel<T, Context>(dev_ctx,
q,
k,
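In effect, when the incoming gradient has no elements the backward pass defines dq/dk/dv as zero tensors at their declared shapes instead of launching the FlashAttention backward kernel. A small sketch of that convention (shapes are illustrative):

import paddle

# dout is zero-size when, e.g., the query has sequence length 0; the grads are then
# zero-filled at the q/k/v shapes -- the Python-level analogue of Full<T, Context>.
dq = paddle.full([2, 0, 8, 64], 0.0)    # same shape as a zero-length-query q
dk = paddle.full([2, 128, 8, 64], 0.0)  # k may still be non-empty; its grad is all zeros
assert dq.size == 0
assert float(dk.abs().sum()) == 0.0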
25 changes: 25 additions & 0 deletions paddle/phi/kernels/gpu/flash_attn_kernel.cu
@@ -633,6 +633,31 @@ void FlashAttnKernel(const Context& dev_ctx,
DenseTensor* softmax,
DenseTensor* softmax_lse,
DenseTensor* seed_offset) {
if (q.numel() == 0 || k.numel() == 0 || v.numel() == 0) {
if (out) {
Full<T, Context>(
dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
}
if (softmax) {
Full<T, Context>(dev_ctx,
phi::IntArray(common::vectorize(softmax->dims())),
0,
softmax);
}
if (softmax_lse) {
Full<T, Context>(dev_ctx,
phi::IntArray(common::vectorize(softmax_lse->dims())),
0,
softmax_lse);
}
if (seed_offset) {
Full<T, Context>(dev_ctx,
phi::IntArray(common::vectorize(seed_offset->dims())),
0,
seed_offset);
}
return;
}
FlashAttnBaseKernel<T, Context>(dev_ctx,
q,
k,
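The new TestAttentionWithBoolMaskZeroSize case at the end of this diff drives this zero-size path from Python. A minimal sketch of the same kind of call (assuming paddle.nn.functional.scaled_dot_product_attention, which the parent test class exercises; a CUDA build is required):

import paddle
import paddle.nn.functional as F

shape = (0, 1, 8, 8)  # zero-size batch, as in the new test
q = paddle.randn(shape, dtype="float32")
k = paddle.randn(shape, dtype="float32")
v = paddle.randn(shape, dtype="float32")
out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False)
assert out.shape == list(shape)  # output keeps the zero-size shape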
50 changes: 50 additions & 0 deletions test/legacy_test/test_fused_rotary_position_embedding.py
@@ -692,5 +692,55 @@ def test_error2():
self.assertRaises(AssertionError, test_error2)


@unittest.skipIf(
not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(),
"core is not compiled with CUDA or ROCM ",
)
class TestFusedRotaryPositionEmbeddingZeroSize(unittest.TestCase):
def setUp(self):
self.dtype = "float32"
self.qkv_shape = [0, 1, 8, 8]
self.sin_cos_shape = [1, 1, 1, 8]

def init_data(self):
self.q = paddle.randn(self.qkv_shape, dtype=self.dtype)
self.k = paddle.randn(self.qkv_shape, dtype=self.dtype)
self.v = paddle.randn(self.qkv_shape, dtype=self.dtype)
self.q.stop_gradient = False
self.k.stop_gradient = False
self.v.stop_gradient = False
self.sin = paddle.sin(
paddle.randn(self.sin_cos_shape, dtype=self.dtype)
)
self.cos = paddle.cos(
paddle.randn(self.sin_cos_shape, dtype=self.dtype)
)

def _test_forward_backward(self):
out_q, out_k, out_v = fused_rotary_position_embedding(
self.q,
self.k,
self.v,
sin=self.sin,
cos=self.cos,
use_neox_rotary_style=False,
)
out = out_q + out_k + out_v
out.backward()
np.testing.assert_array_equal(self.q.shape, self.q.grad.shape)
np.testing.assert_array_equal(self.k.shape, self.k.grad.shape)
np.testing.assert_array_equal(self.v.shape, self.v.grad.shape)

def test_zero_size(self):
self.init_data()
self._test_forward_backward()


if __name__ == "__main__":
unittest.main()
9 changes: 9 additions & 0 deletions test/legacy_test/test_scaled_dot_product_attention.py
@@ -220,5 +220,14 @@ def test_3d_input(self):
np.testing.assert_allclose(out.numpy(), out_ref, rtol=5e-03, atol=1e-03)


class TestAttentionWithBoolMaskZeroSize(TestAttentionWithBoolMask):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (0, 1, 8, 8)
self.dtype = 'float32'
self.dropout = 0.0
self.causal = False


if __name__ == '__main__':
unittest.main()