
Commit 3410ba9

Gemma3 hybrid fix (#42287)
* Fix gemma3 on H100
* Partial fixes for Mi325
* First half of A10 fix
* Final A10 fix
1 parent f7e964e commit 3410ba9
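In short: the Gemma3 config already sets cache_implementation="hybrid" (the export test below still asserts this), so these tests stop passing cache_implementation="hybrid" to generate() and either rely on the config default or request the static cache explicitly, while a few device-specific expected outputs are refreshed for the H100, MI325 and A10 runners. A minimal sketch of the resulting call pattern, assuming a Gemma3 text checkpoint (the checkpoint id and device handling are illustrative assumptions; the tests load their own model_id):

# A minimal sketch, not the test itself.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-3-1b-it"  # assumed checkpoint, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_id)

# Gemma3 configs already declare the hybrid cache, as the export test asserts,
# so generate() no longer needs an explicit cache_implementation="hybrid".
assert model.config.cache_implementation == "hybrid"

inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])

The assumption here is that generate() picks up the model's configured cache implementation when no override is passed, which is the behavior the updated tests rely on.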


tests/models/gemma3/test_modeling_gemma3.py

Lines changed: 16 additions & 30 deletions
@@ -134,7 +134,6 @@ def test_generation_beyond_sliding_window_tiny_model(self):
             max_new_tokens=1,
             do_sample=False,
             use_cache=True,
-            cache_implementation="hybrid",
             disable_compile=True,
         )
         # 2 generations are needed to trigger https://github.com/huggingface/transformers/issues/39711
@@ -145,11 +144,9 @@ def test_generation_beyond_sliding_window_tiny_model(self):
             max_new_tokens=5,
             do_sample=False,
             use_cache=True,
-            cache_implementation="hybrid",
             disable_compile=True,
         )
         generated_sequences = output[:, input_len:].cpu()
-        print(generated_sequences)
         EXPECTED_OUTPUT = torch.tensor([[90109, 90109, 90109, 83191, 83191], [246901, 69832, 69832, 69832, 62288]])
         torch.testing.assert_close(generated_sequences, EXPECTED_OUTPUT)

@@ -493,16 +490,15 @@ def test_model_4b_bf16(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'],
                 ("cuda", (8, 0)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'],
                 ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'],
-                ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'],
+                ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'],
                 ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant coastline in the background. It looks'],
             }
         ) # fmt: skip
@@ -543,8 +539,7 @@ def test_model_4b_batch(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -561,12 +556,12 @@ def test_model_4b_batch(self):
                 ],
                 ("cuda", (8,6)):
                 [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like',
+                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
                     "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown"
                 ],
                 ("rocm", (9, 4)):
                 [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
+                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a',
                     "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow"
                 ],
                 ("rocm", (9, 5)):
@@ -603,24 +598,22 @@ def test_model_4b_crops(self):
             **crop_config,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
-                ("cuda", 7): [],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a clear blue sky with some white clouds above."],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"],
+                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
+                ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"]
             }
         ) # fmt: skip
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
-        print(f"Generated text: {output_text}")
         self.assertEqual(output_text, EXPECTED_TEXT)

     @require_torch_large_accelerator
@@ -665,8 +658,7 @@ def test_model_4b_batch_crops(self):
             **crop_config,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)
         EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9
         EXPECTED_TEXTS = Expectations(
@@ -726,15 +718,14 @@ def test_model_4b_multiimage(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a city with"],
-                ("cuda", 7): [],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"],
                 ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"],
+                ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Location:**"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a vibrant"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"],
             }
@@ -750,8 +741,7 @@ def test_model_1b_text_only(self):
         tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
         inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -785,8 +775,7 @@ def test_model_4b_flash_attn(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -827,9 +816,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str):
         input_size = inputs.input_ids.shape[-1]
         self.assertTrue(input_size > model.config.sliding_window)

-        out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static")[
-            :, input_size:
-        ]
+        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)[:, input_size:]
         output_text = tokenizer.batch_decode(out)

         EXPECTED_COMPLETIONS = [
@@ -839,7 +826,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str):
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)

     @pytest.mark.torch_export_test
-    def test_export_text_only_with_hybrid_cache(self):
+    def test_export_text_only(self):
         if not is_torch_greater_or_equal("2.6.0"):
             self.skipTest(reason="This test requires torch >= 2.6 to run.")

@@ -849,7 +836,7 @@ def test_export_text_only_with_hybrid_cache(self):
         model = AutoModelForCausalLM.from_pretrained(model_id)
         self.assertEqual(model.config.cache_implementation, "hybrid")

-        # Export + hybrid cache
+        # Export
         model.eval()
         exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024)
         exported_program = exportable_module.export(
@@ -874,7 +861,6 @@ def test_export_text_only_with_hybrid_cache(self):
             **input_text,
             max_new_tokens=max_new_tokens_to_generate,
             do_sample=False, # Use greedy decoding to match the exported model
-            cache_implementation="hybrid",
         )

         eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True)
