
Commit 9e4199e

Gemma3 fixes (#41572)
* Multiple device error fix
* FA2 equivalence fix
* Move the train fwd in cfg test
* Style
* Added comment
* Made the comment more clear
1 parent 4c8d293 commit 9e4199e

4 files changed (+29, -7 lines changed)

src/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 1 addition & 1 deletion
@@ -798,7 +798,7 @@ def create_causal_mask_mapping(
         is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
         new_image_start = is_image & ~is_previous_image
         image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
-        image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
+        image_group_ids = torch.where(is_image, image_group_ids, -1)
         mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
             token_type_ids.to(cache_position.device), image_group_ids
         )
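
For context on the "Multiple device error fix", here is a minimal standalone sketch (toy tensors, not the commit's code) of why the scalar fill helps: `torch.full_like(token_type_ids, -1)` allocates the fill tensor on `token_type_ids`' device, so `torch.where` can hit a device-mismatch error when `image_group_ids` lives on a different device (e.g. under model parallelism), whereas a plain Python `-1` has no device and is broadcast wherever the other operands live.

import torch
from torch import nn

# Toy stand-ins for the real tensors (values and shapes are illustrative only).
token_type_ids = torch.tensor([[0, 1, 1, 0]])
is_image = token_type_ids == 1
is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
new_image_start = is_image & ~is_previous_image
image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1

# Old: torch.full_like(token_type_ids, -1) pins the fill value to token_type_ids' device,
# which may differ from image_group_ids' device in multi-device setups.
# New: a Python scalar needs no device, so torch.where broadcasts it safely.
image_group_ids = torch.where(is_image, image_group_ids, -1)
print(image_group_ids)  # -> [[-1, 0, 0, -1]]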

src/transformers/models/gemma3/modular_gemma3.py

Lines changed: 1 addition & 1 deletion
@@ -764,7 +764,7 @@ def create_causal_mask_mapping(
         is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
         new_image_start = is_image & ~is_previous_image
         image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
-        image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
+        image_group_ids = torch.where(is_image, image_group_ids, -1)
         mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
             token_type_ids.to(cache_position.device), image_group_ids
         )

tests/models/gemma3/test_modeling_gemma3.py

Lines changed: 17 additions & 0 deletions
@@ -19,6 +19,7 @@
 
 import pytest
 from parameterized import parameterized
+from pytest import mark
 
 from transformers import (
     AutoModelForCausalLM,
@@ -33,9 +34,11 @@
     is_flash_attn_2_available,
     require_deterministic_for_xpu,
     require_flash_attn,
+    require_flash_attn_3,
     require_read_token,
     require_torch,
     require_torch_accelerator,
+    require_torch_gpu,
     require_torch_large_accelerator,
     slow,
     torch_device,
@@ -342,6 +345,20 @@ def test_automodelforcausallm(self):
             for_causal_lm = AutoModelForCausalLM.from_pretrained(tmp_dir)
             self.assertIsInstance(for_causal_lm, Gemma3ForConditionalGeneration)
 
+    @require_flash_attn
+    @require_torch_gpu
+    @mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_from_config(self):
+        self.flash_attn_from_config(attn_implementation="flash_attention_2", test_fwd_in_train=False)
+
+    @require_flash_attn_3
+    @require_torch_gpu
+    @mark.flash_attn_3_test
+    @slow
+    def test_flash_attn_3_from_config(self):
+        self.flash_attn_from_config(attn_implementation="flash_attention_3", test_fwd_in_train=False)
+
 
     @slow
     @require_torch_accelerator
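
As a rough sketch of the pattern these new tests follow (class and model names below such as `CommonMixin`, `TinyModel`, and `Gemma3StyleTests` are illustrative, not the library's API): the model-specific test class reuses the shared helper and opts out of the train-mode forward pass via the new `test_fwd_in_train` flag.

import torch
from torch import nn

class TinyModel(nn.Module):
    # Hypothetical stand-in for a model built from a config.
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)

class CommonMixin:
    # Mirrors the shape of flash_attn_from_config: train mode is the default,
    # eval mode is the opt-out for models whose training forward needs extra inputs.
    def flash_attn_from_config(self, attn_implementation: str, test_fwd_in_train: bool = True):
        fa_model = TinyModel()
        fa_model = fa_model.train() if test_fwd_in_train else fa_model.eval()
        _ = fa_model(torch.randn(2, 8))

class Gemma3StyleTests(CommonMixin):
    def test_flash_attn_2_from_config(self):
        # Gemma3's train-mode forward needs different inputs, so run in eval mode.
        self.flash_attn_from_config("flash_attention_2", test_fwd_in_train=False)

Gemma3StyleTests().test_flash_attn_2_from_config()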

tests/test_modeling_common.py

Lines changed: 10 additions & 5 deletions
@@ -2976,7 +2976,7 @@ def test_model_is_small(self):
 
     def flash_attn_inference_equivalence(
         self, attn_implementation: str, padding_side: str, atol: float = 4e-2, rtol: float = 4e-2
-    ):
+    ) -> None:
         r"""
         Tests the equivalence between the eager and flash attention implementations.
         This test is only for inference and runs with `dtype=torch.bfloat16`.
@@ -3114,9 +3114,6 @@ def flash_attn_inference_equivalence(
         torch.testing.assert_close(logits_1_eager, logits_1_fa, atol=atol, rtol=rtol)
         if padding_side == "left":
             torch.testing.assert_close(logits_2_eager[1:], logits_2_fa[1:], atol=atol, rtol=rtol)
-            # Check it can run in training mode
-            model.train()
-            _ = model(**second_inputs)
         else:
             torch.testing.assert_close(logits_2_eager[:-1], logits_2_fa[:-1], atol=atol, rtol=rtol)
 
@@ -3651,7 +3648,7 @@ def test_flash_attn_2_can_compile_with_attention_mask_None_without_graph_break(s
 
         assert not loss.isnan().any()
 
-    def flash_attn_from_config(self, attn_implementation: str):
+    def flash_attn_from_config(self, attn_implementation: str, test_fwd_in_train: bool = True):
         r"""
         Tests if the model can be loaded with `attn_implementation` from the config and if the
         weights are not randomly initialized.
@@ -3669,6 +3666,14 @@ def flash_attn_from_config(self, attn_implementation: str):
             config, attn_implementation=attn_implementation, dtype=torch.bfloat16
         ).to(torch_device)
 
+        # By default, we perform the forward pass in train mode, because it's more strict than eval mode. If the
+        # forward pass is successful in train mode, it will also be successful in eval mode. But since some models
+        # (e.g. Gemma3) need different inputs in train mode, we have the option to test the forward pass in eval mode.
+        if test_fwd_in_train:
+            fa_model = fa_model.train()
+        else:
+            fa_model = fa_model.eval()
+
         dummy_input = inputs_dict[fa_model.main_input_name]
         if dummy_input.dtype in [torch.float32, torch.float16]:
             dummy_input = dummy_input.to(torch.bfloat16)
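
To illustrate the reasoning in the added comment with a tiny standalone sketch (generic `torch.nn` modules, not the test suite's code): train mode keeps training-only branches such as dropout active, so a forward pass that succeeds in train mode will also succeed in eval mode, while the reverse is not guaranteed; hence train mode is the stricter default and eval mode is the opt-out for models like Gemma3 that need different inputs when training.

import torch
from torch import nn

# Dropout stands in for any training-only code path.
model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(p=0.5))
x = torch.randn(2, 8)

model.train()
out_train = model(x)  # dropout active: the training-only branch is exercised

model.eval()
out_eval = model(x)   # dropout is a no-op in eval mode

print(out_train.shape, out_eval.shape)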
