
Commit 5bdb704

Fix sliding window attn mask (#41228)
* Fix sliding window attn mask
* Clearer test
* Apply style fixes
* If Picasso made ascii drawings he would have made this

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent a61fc6a commit 5bdb704

File tree: 2 files changed, +95 −3 lines changed


src/transformers/generation/continuous_batching/continuous_api.py

Lines changed: 52 additions & 3 deletions
@@ -42,7 +42,56 @@ def build_attention_mask(
 ) -> None:
     """Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
     will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
-    equivalent) so it's more of an attention score bias tensor."""
+    equivalent) so it's more of an attention score bias tensor.
+    The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
+    Each of those blocks is built from a causal mask and, if there is a sliding window, a sliding window mask.
+
+    An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:
+
+    CAUSAL MASK:
+
+    █ █ █ █ █ ░ ░ ░
+    █ █ █ █ █ █ ░ ░
+    █ █ █ █ █ █ █ ░
+    █ █ █ █ █ █ █ █
+
+    SLIDING WINDOW MASK:
+    ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the right
+    <─┴─>
+    ░ █ | █ █ █ █ █ █ █ █
+    ░ ░ | █ █ █ █ █ █ █ █
+    ░ ░ | ░ █ █ █ █ █ █ █
+    ░ ░ | ░ ░ █ █ █ █ █ █
+
+    ATTENTION MASK (sum of causal and sliding window masks):
+
+    █ █ █ █ █ ░ ░ ░
+    █ █ █ █ █ █ ░ ░
+    ░ █ █ █ █ █ █ ░
+    ░ ░ █ █ █ █ █ █
+
+    Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:
+
+    CAUSAL MASK:
+
+    █ █ █ ░ ░
+    █ █ █ █ ░
+    █ █ █ █ █
+
+    SLIDING WINDOW MASK:
+    ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the right
+    <┴>
+    | ░ █ █ █ █
+    | ░ ░ █ █ █
+    | ░ ░ ░ █ █
+
+    ATTENTION MASK (sum of causal and sliding window masks):
+
+    ░ █ █ ░ ░
+    ░ ░ █ █ ░
+    ░ ░ ░ █ █
+
+    """
     min_value = torch.finfo(attention_mask.dtype).min
     for i in range(len(cumulative_seqlens_q) - 1):
         seqlen_q = cumulative_seqlens_q[i + 1] - cumulative_seqlens_q[i]
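
To make the diagrams above concrete, here is a minimal standalone sketch (not the library function) that reproduces the first docstring example with plain torch calls. The causal offset `seqlen_k - seqlen_q + 1` is an assumption inferred from the diagrams; its actual definition lives outside the hunks shown here.

import torch

# Reproduce the first docstring example: seqlen_k = 8, seqlen_q = 4, sliding_window = 6.
seqlen_q, seqlen_k, sliding_window = 4, 8, 6
minus_inf = torch.full((seqlen_q, seqlen_k), torch.finfo(torch.float32).min)

# Assumed causal offset (inferred from the diagrams, not shown in this diff).
causal = torch.triu(minus_inf, diagonal=seqlen_k - seqlen_q + 1)                  # blocks future keys
sliding = torch.tril(minus_inf, diagonal=seqlen_k - seqlen_q - sliding_window)    # blocks keys that left the window
bias = causal + sliding

# Render like the docstring: '█' where the query may attend (bias == 0), '░' otherwise.
print("\n".join(" ".join("█" if x == 0 else "░" for x in row) for row in bias))
# █ █ █ █ █ ░ ░ ░
# █ █ █ █ █ █ ░ ░
# ░ █ █ █ █ █ █ ░
# ░ ░ █ █ █ █ █ █

As the second hunk below shows, the library function only adds the sliding-window term when `sliding_window > 1` (1 meaning "no sliding window").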
@@ -63,8 +112,8 @@ def build_attention_mask(
         masked = torch.triu(minus_inf, diagonal=causal_diagonal)
         # Apply sliding window mask if needed
         if sliding_window > 1:
-            sliding_diagonal = seqlen_k - seqlen_q + sliding_window
-            masked = torch.tril(masked, diagonal=sliding_diagonal)
+            sliding_diagonal = seqlen_k - seqlen_q - sliding_window
+            masked += torch.tril(minus_inf, diagonal=sliding_diagonal)
         # Replace in attention mask
         attention_mask[..., query_range, key_range] = masked

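The fix changes both the diagonal and the way the window is applied. In this bias convention (0 = attend, -inf = blocked), `torch.tril(masked, ...)` can only zero out entries, so the old branch could never block anything new, and keys that dropped out of the sliding window stayed visible. A hedged before/after check, using the docstring's second example and the same assumed causal offset as the sketch above:

import torch

# seqlen_k = 5, seqlen_q = 3, sliding_window = 2 (the docstring's second example).
seqlen_q, seqlen_k, sliding_window = 3, 5, 2
minus_inf = torch.full((seqlen_q, seqlen_k), torch.finfo(torch.float32).min)
masked = torch.triu(minus_inf, diagonal=seqlen_k - seqlen_q + 1)  # causal bias: 0 = attend

# Old branch: tril of `masked` only zeroes entries above the cut, never adds -inf.
old = torch.tril(masked, diagonal=seqlen_k - seqlen_q + sliding_window)

# New branch: add a fresh tril of -inf at and below the window boundary.
new = masked + torch.tril(minus_inf, diagonal=seqlen_k - seqlen_q - sliding_window)

print((old == 0).int())  # 11100 / 11110 / 11111 -> identical to the plain causal mask
print((new == 0).int())  # 01100 / 00110 / 00011 -> matches the docstring diagram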
tests/generation/test_continuous_batching.py

Lines changed: 43 additions & 0 deletions
@@ -20,6 +20,7 @@
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from transformers.generation.continuous_batching.cache import group_layers_by_attn_type
+from transformers.generation.continuous_batching.continuous_api import build_attention_mask
 from transformers.testing_utils import Expectations, require_kernels, require_torch_gpu, slow
 
 
@@ -88,6 +89,48 @@ def test_group_layers(
                 f"Test failed for: {layer_types_str = }, {sliding_window = }, {group_types = }",
             )
 
+    @parameterized.expand(
+        [
+            ([0, 4], [0, 4], 1, ["1000", "1100", "1110", "1111"]),
+            ([0, 4], [0, 4], 2, ["1000", "1100", "0110", "0011"]),
+            ([0, 3], [0, 5], 1, ["11100", "11110", "11111"]),
+            ([0, 3], [0, 5], 3, ["11100", "01110", "00111"]),
+            ([0, 3, 6], [0, 3, 6], 1, ["100000", "110000", "111000", "000100", "000110", "000111"]),
+            ([0, 3, 6], [0, 3, 6], 2, ["100000", "110000", "011000", "000100", "000110", "000011"]),
+        ]
+    )
+    def test_attention_mask(
+        self,
+        cumulative_seqlens_q: list[int],
+        cumulative_seqlens_k: list[int],
+        sliding_window: int,  # the sliding window size, 1 means no sliding window
+        str_expected_mask: list[str],  # the attention mask, broken down by line as a string of 0s and 1s
+    ) -> None:
+        # Build expected mask
+        minus_inf = torch.finfo(torch.float32).min
+        expected_mask = torch.empty((cumulative_seqlens_q[-1], cumulative_seqlens_k[-1]), dtype=torch.float32)
+        for i, line in enumerate(str_expected_mask):
+            expected_mask[i, :] = torch.tensor([minus_inf if c == "0" else 0 for c in line])
+        # Build actual mask
+        actual_mask = torch.full_like(expected_mask, minus_inf)  # function modifies in place
+        build_attention_mask(
+            actual_mask, torch.tensor(cumulative_seqlens_q), torch.tensor(cumulative_seqlens_k), sliding_window
+        )
+        # Check that the actual mask matches the expected mask
+        matches = (expected_mask == actual_mask).all()
+        # If it doesn't match, print the masks in a readable form and fail the test
+        if not matches:
+            str_mask = [
+                "".join("1" if x == 0 else "0" for x in token_attn_vector) for token_attn_vector in actual_mask
+            ]
+            str_mask = "\n".join(str_mask)
+            str_expected_mask = "\n".join(str_expected_mask)
+            self.fail(
+                f"Test failed for: {cumulative_seqlens_q = }, {cumulative_seqlens_k = }, {sliding_window = }\n"
+                f"Expected mask:\n{str_expected_mask}\n"
+                f"Actual mask:\n{str_mask}"
+            )
+
     def _continuous_batching_parity(
         self, model_id: str, attn_implementation: str, expected_outputs: dict[str, str]
     ) -> None:
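
Outside the test harness, the same check can be run by hand by calling `build_attention_mask` directly. This sketch mirrors the `([0, 3, 6], [0, 3, 6], sliding_window=2)` case from the table above, using the same import path as the test; with two packed sequences the result is block-diagonal.

import torch

from transformers.generation.continuous_batching.continuous_api import build_attention_mask

# Two packed sequences of length 3 each, sliding window of 2.
cumulative_seqlens_q = torch.tensor([0, 3, 6])
cumulative_seqlens_k = torch.tensor([0, 3, 6])
mask = torch.full((6, 6), torch.finfo(torch.float32).min)  # filled in place by the function

build_attention_mask(mask, cumulative_seqlens_q, cumulative_seqlens_k, 2)

# Print in the same 0/1 string form used by the test ('1' = may attend).
print("\n".join("".join("1" if x == 0 else "0" for x in row) for row in mask))
# 100000
# 110000
# 011000
# 000100
# 000110
# 000011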
