Commit f0a30a0 (parent: 9d6cff3)

[Bugfix] Fix qwen-moe packed_modules_mapping (vllm-project#26634)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

3 files changed: 19 additions, 11 deletions

vllm/model_executor/models/interfaces.py

Lines changed: 1 addition & 1 deletion

@@ -325,7 +325,7 @@ class SupportsLoRA(Protocol):
     # are empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
     embedding_padding_modules: ClassVar[list[str]] = []
-    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}
+    packed_modules_mapping: dict[str, list[str]] = {}


     # We can't use runtime_checkable with ClassVar for issubclass checks
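
Why the ClassVar is dropped: the two MoE models below now update packed_modules_mapping from __init__, so the mapping varies with the model configuration rather than being a class constant; dropping ClassVar also permits rebinding the attribute through an instance, which mypy forbids for a ClassVar. A minimal sketch of the attribute semantics this change relies on; the class names and the has_dense_mlp flag are illustrative, not vLLM code:

class SupportsLoRADemo:
    # Class-level default, shared until a subclass overrides it.
    packed_modules_mapping: dict[str, list[str]] = {}


class DemoModel(SupportsLoRADemo):
    # Each model class defines its own dict, shadowing the base default.
    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

    def __init__(self, has_dense_mlp: bool) -> None:
        if has_dense_mlp:
            # Item assignment through `self` resolves to the class-level
            # dict and mutates it in place, so the entry persists for
            # every later instance of DemoModel as well.
            self.packed_modules_mapping["gate_up_proj"] = [
                "gate_proj",
                "up_proj",
            ]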

vllm/model_executor/models/qwen2_moe.py

Lines changed: 11 additions & 5 deletions

@@ -534,11 +534,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -547,6 +543,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
+        # Only perform the following mapping when Qwen2MoeMLP exists
+        if (
+            getattr(config, "mlp_only_layers", [])
+            or config.shared_expert_intermediate_size > 0
+        ):
+            self.packed_modules_mapping["gate_up_proj"] = [
+                "gate_proj",
+                "up_proj",
+            ]
+
         self.model = Qwen2MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
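
The guard matters because gate_proj and up_proj only exist as separate weights when the model builds a Qwen2MoeMLP, i.e. when some decoder layers are dense (mlp_only_layers) or a shared expert is configured; a pure-MoE checkpoint must not advertise a fused gate_up_proj. A self-contained sketch of that condition, with a hypothetical helper name and a stand-in config class:

from dataclasses import dataclass, field


@dataclass
class DemoMoeConfig:
    # Stand-ins for the HF config fields the commit reads.
    mlp_only_layers: list[int] = field(default_factory=list)
    shared_expert_intermediate_size: int = 0


def uses_dense_mlp(config: DemoMoeConfig) -> bool:
    # Mirrors the commit's condition: True when Qwen2MoeMLP is built.
    return bool(getattr(config, "mlp_only_layers", [])) or (
        config.shared_expert_intermediate_size > 0
    )


# Pure-MoE checkpoint: gate_up_proj stays out of the packed mapping.
assert not uses_dense_mlp(DemoMoeConfig())
# Shared-expert checkpoint: the fused projection exists, so map it.
assert uses_dense_mlp(DemoMoeConfig(shared_expert_intermediate_size=5632))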

vllm/model_executor/models/qwen3_moe.py

Lines changed: 7 additions & 5 deletions

@@ -634,11 +634,7 @@ class Qwen3MoeForCausalLM(
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }

     fall_back_to_pt_during_load = False
@@ -649,6 +645,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
+        # Only perform the following mapping when Qwen3MoeMLP exists
+        if getattr(config, "mlp_only_layers", []):
+            self.packed_modules_mapping["gate_up_proj"] = [
+                "gate_proj",
+                "up_proj",
+            ]
         self.model = Qwen3MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
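
For context on why a stale entry is harmful: vLLM's LoRA machinery consults packed_modules_mapping to translate per-checkpoint target names (e.g. gate_proj) into the fused module names the model actually creates (e.g. gate_up_proj), so advertising a fused module that is never instantiated can misroute adapter weights. A hypothetical, simplified translator illustrating the lookup direction; this is not vLLM's actual loader code:

def resolve_lora_targets(
    targets: list[str], packed: dict[str, list[str]]
) -> set[str]:
    # Map each target to the fused module that packs it, if any;
    # otherwise keep the name unchanged.
    resolved: set[str] = set()
    for name in targets:
        fused = next((k for k, subs in packed.items() if name in subs), name)
        resolved.add(fused)
    return resolved


# For a pure-MoE Qwen3 checkpoint the mapping now packs only the attention
# projections, so expert gate/up targets pass through untranslated.
packed = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
assert resolve_lora_targets(["q_proj", "gate_proj"], packed) == {
    "qkv_proj",
    "gate_proj",
}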
