Commit 304c6a1

Enable fx tracing for Mistral (huggingface#30209)
* tracing for mistral
* typo
* fix copies
1 parent 98717cb commit 304c6a1

File tree

7 files changed: +9, -6 lines changed

7 files changed

+9
-6
lines changed

src/transformers/models/mixtral/modeling_mixtral.py

Lines changed: 0 additions & 3 deletions

@@ -868,9 +868,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
 
-            if top_x.shape[0] == 0:
-                continue
-
             # Index the correct hidden states and compute the expert hidden state for
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
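The guard removed here (and from the Qwen2MoE copy below) is data-dependent control flow: the length of `top_x` produced by `torch.where` is only known once the routing weights have been computed, so a symbolic tracer cannot decide which branch to record. A minimal sketch of the constraint, using vanilla `torch.fx` rather than transformers' `HFTracer`, with a hypothetical toy module that only mirrors the removed guard:

```python
import torch
from torch import fx, nn


class Gate(nn.Module):
    """Hypothetical toy module mirroring the removed guard."""

    def forward(self, mask: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        top_x = torch.nonzero(mask).squeeze(-1)
        # `top_x.shape[0]` depends on the *values* in `mask`, so this branch is
        # data-dependent control flow and cannot be baked into a static graph.
        if top_x.shape[0] == 0:
            return hidden_states
        return hidden_states[top_x]


try:
    fx.symbolic_trace(Gate())
except Exception as err:
    # torch.fx refuses: symbolically traced variables cannot be used
    # as inputs to control flow.
    print(type(err).__name__, err)
```

Dropping the early `continue` keeps the loop body branch-free; indexing with an empty `top_x` is numerically a no-op, so eager results should be unchanged.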

src/transformers/models/qwen2_moe/modeling_qwen2_moe.py

Lines changed: 0 additions & 3 deletions

@@ -840,9 +840,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
 
-            if top_x.shape[0] == 0:
-                continue
-
             # Index the correct hidden states and compute the expert hidden state for
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)

src/transformers/utils/fx.py

Lines changed: 5 additions & 0 deletions

@@ -141,12 +141,16 @@ def _generate_supported_model_class_names(
     "marian",
     "mbart",
     "megatron-bert",
+    "mistral",
+    "mixtral",
     "mobilebert",
     "mt5",
     "nezha",
     "opt",
     "pegasus",
     "plbart",
+    "qwen2",
+    "qwen2_moe",
     "resnet",
     "roberta",
     "segformer",
@@ -758,6 +762,7 @@ class HFTracer(Tracer):
         "tensor",
         "clamp",
         "finfo",
+        "tril",
     ]
     supported_archs = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel)
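With "mistral", "mixtral", "qwen2", and "qwen2_moe" registered above, `transformers.utils.fx.symbolic_trace` accepts these architectures. A minimal sketch of what that enables, using an arbitrary tiny config (the values are illustrative, not taken from the repository) so nothing is downloaded:

```python
from transformers import MistralConfig, MistralForCausalLM
from transformers.utils.fx import symbolic_trace

# Tiny, purely illustrative config; eager attention keeps the traced graph simple.
config = MistralConfig(
    vocab_size=1024,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    attn_implementation="eager",
)
model = MistralForCausalLM(config)

# The tracer now recognises the Mistral model classes.
traced = symbolic_trace(model, input_names=["input_ids", "attention_mask"])
print(traced.graph)
```

The added "tril" entry additionally lets the tracer patch `torch.tril`, which the sliding-window causal-mask construction appears to rely on during tracing.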

tests/models/mistral/test_modeling_mistral.py

Lines changed: 1 addition & 0 deletions

@@ -303,6 +303,7 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     )
     test_headmasking = False
     test_pruning = False
+    fx_compatible = True
 
     # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
     def is_pipeline_test_to_skip(

tests/models/mixtral/test_modeling_mixtral.py

Lines changed: 1 addition & 0 deletions

@@ -302,6 +302,7 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     )
     test_headmasking = False
     test_pruning = False
+    fx_compatible = True
 
     # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
     def is_pipeline_test_to_skip(

tests/models/qwen2/test_modeling_qwen2.py

Lines changed: 1 addition & 0 deletions

@@ -313,6 +313,7 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
     )
     test_headmasking = False
     test_pruning = False
+    fx_compatible = True
 
     # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
     def is_pipeline_test_to_skip(

tests/models/qwen2_moe/test_modeling_qwen2_moe.py

Lines changed: 1 addition & 0 deletions

@@ -342,6 +342,7 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
     )
     test_headmasking = False
     test_pruning = False
+    fx_compatible = True
 
     # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
     def is_pipeline_test_to_skip(
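Setting `fx_compatible = True` opts each of these models into the shared FX tests from `ModelTesterMixin`, which roughly trace the model and check that the traced graph reproduces eager outputs. A minimal sketch of that kind of check, with an arbitrary small Mixtral config (illustrative values only) and defensive handling of the traced module's output container:

```python
import torch
from transformers import MixtralConfig, MixtralForCausalLM
from transformers.utils.fx import symbolic_trace

# Small illustrative config; eager attention keeps the traced graph simple.
config = MixtralConfig(
    vocab_size=512,
    hidden_size=32,
    intermediate_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=2,
    num_local_experts=4,
    num_experts_per_tok=2,
    attn_implementation="eager",
)
model = MixtralForCausalLM(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
attention_mask = torch.ones_like(input_ids)

traced = symbolic_trace(model, input_names=["input_ids", "attention_mask"])
with torch.no_grad():
    eager_logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    traced_out = traced(input_ids=input_ids, attention_mask=attention_mask)
    # The traced module may return a plain dict rather than a ModelOutput.
    traced_logits = traced_out["logits"] if isinstance(traced_out, dict) else traced_out.logits

torch.testing.assert_close(eager_logits, traced_logits)
```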

0 commit comments
