
Commit 3410ba9

Gemma3 hybrid fix (#42287)
* Fix gemma3 on H100
* Partial fixes for Mi325
* First half of A10 fix
* Final A10 fix
1 parent f7e964e commit 3410ba9
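In short: the Gemma3 config already sets cache_implementation="hybrid" (the export test below still asserts this), so these tests stop passing cache_implementation="hybrid" to generate() and either rely on the config default or request the static cache explicitly, while a few device-specific expected outputs are refreshed for the H100, MI325 and A10 runners. A minimal sketch of the resulting call pattern, assuming a Gemma3 text checkpoint (the checkpoint id and device handling are illustrative assumptions; the tests load their own model_id):

# A minimal sketch, not the test itself.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-3-1b-it"  # assumed checkpoint, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_id)

# Gemma3 configs already declare the hybrid cache, as the export test asserts,
# so generate() no longer needs an explicit cache_implementation="hybrid".
assert model.config.cache_implementation == "hybrid"

inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0])

The assumption here is that generate() picks up the model's configured cache implementation when no override is passed, which is the behavior the updated tests rely on.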


tests/models/gemma3/test_modeling_gemma3.py

Lines changed: 16 additions & 30 deletions
@@ -134,7 +134,6 @@ def test_generation_beyond_sliding_window_tiny_model(self):
             max_new_tokens=1,
             do_sample=False,
             use_cache=True,
-            cache_implementation="hybrid",
             disable_compile=True,
         )
         # 2 generations are needed to trigger https://github.com/huggingface/transformers/issues/39711
@@ -145,11 +144,9 @@ def test_generation_beyond_sliding_window_tiny_model(self):
             max_new_tokens=5,
             do_sample=False,
             use_cache=True,
-            cache_implementation="hybrid",
             disable_compile=True,
         )
         generated_sequences = output[:, input_len:].cpu()
-        print(generated_sequences)
         EXPECTED_OUTPUT = torch.tensor([[90109, 90109, 90109, 83191, 83191], [246901, 69832, 69832, 69832, 62288]])
         torch.testing.assert_close(generated_sequences, EXPECTED_OUTPUT)

@@ -493,16 +490,15 @@ def test_model_4b_bf16(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'],
                 ("cuda", (8, 0)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'],
                 ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'],
-                ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'],
+                ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'],
                 ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant coastline in the background. It looks'],
             }
         ) # fmt: skip
@@ -543,8 +539,7 @@ def test_model_4b_batch(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -561,12 +556,12 @@ def test_model_4b_batch(self):
                 ],
                 ("cuda", (8,6)):
                 [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like',
+                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
                     "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown"
                 ],
                 ("rocm", (9, 4)):
                 [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
+                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a',
                     "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow"
                 ],
                 ("rocm", (9, 5)):
@@ -603,24 +598,22 @@ def test_model_4b_crops(self):
             **crop_config,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
-                ("cuda", 7): [],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a clear blue sky with some white clouds above."],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"],
+                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
+                ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"]
             }
         ) # fmt: skip
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
-        print(f"Generated text: {output_text}")
         self.assertEqual(output_text, EXPECTED_TEXT)

     @require_torch_large_accelerator
@@ -665,8 +658,7 @@ def test_model_4b_batch_crops(self):
             **crop_config,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)
         EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9
         EXPECTED_TEXTS = Expectations(
@@ -726,15 +718,14 @@ def test_model_4b_multiimage(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a city with"],
-                ("cuda", 7): [],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"],
                 ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"],
+                ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Location:**"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a vibrant"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"],
             }
@@ -750,8 +741,7 @@ def test_model_1b_text_only(self):
         tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
         inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="static")
         output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -785,8 +775,7 @@ def test_model_4b_flash_attn(self):
             add_generation_prompt=True,
         ).to(torch_device)

-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_TEXTS = Expectations(
@@ -827,9 +816,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str):
         input_size = inputs.input_ids.shape[-1]
         self.assertTrue(input_size > model.config.sliding_window)

-        out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static")[
-            :, input_size:
-        ]
+        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)[:, input_size:]
         output_text = tokenizer.batch_decode(out)

         EXPECTED_COMPLETIONS = [
@@ -839,7 +826,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str):
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)

     @pytest.mark.torch_export_test
-    def test_export_text_only_with_hybrid_cache(self):
+    def test_export_text_only(self):
         if not is_torch_greater_or_equal("2.6.0"):
             self.skipTest(reason="This test requires torch >= 2.6 to run.")

@@ -849,7 +836,7 @@ def test_export_text_only_with_hybrid_cache(self):
         model = AutoModelForCausalLM.from_pretrained(model_id)
         self.assertEqual(model.config.cache_implementation, "hybrid")

-        # Export + hybrid cache
+        # Export
         model.eval()
         exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024)
         exported_program = exportable_module.export(
@@ -874,7 +861,6 @@ def test_export_text_only_with_hybrid_cache(self):
             **input_text,
             max_new_tokens=max_new_tokens_to_generate,
             do_sample=False, # Use greedy decoding to match the exported model
-            cache_implementation="hybrid",
         )

         eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True)
