
Commit ddc9e76

py-andy-c authored and jeejeelee committed
Enable prequant
1 parent 1b99028 commit ddc9e76

File tree

2 files changed: +9 -7 lines changed

vllm/model_executor/model_loader/bitsandbytes_loader.py

Lines changed: 2 additions & 6 deletions

@@ -427,13 +427,9 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None:
         elif isinstance(module, FusedMoE) and hasattr(
                 module.quant_method, "quant_config"):
             # TODO: support FusedMoE with prequant and 8bit.
-            if self.pre_quant:
+            if self.pre_quant and self.load_8bit:
                 raise ValueError(
-                    "Prequant BitsAndBytes models with FusedMoE is not "
-                    "supported yet.")
-            if self.load_8bit:
-                raise ValueError(
-                    "BitsAndBytes 8bit quantization with FusedMoE is not "
+                    "Prequant BitsAndBytes 8bit models with FusedMoE is not "
                     "supported yet.")
             # Get the corresponding weight name using module name and
             # expert_params_mapping.
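
Note: this change narrows the loader's FusedMoE guard: prequantized 4bit BitsAndBytes checkpoints with FusedMoE layers can now load, and only the prequant + 8bit combination is still rejected. A minimal sketch of the resulting check, with vLLM's loader reduced to a hypothetical stub carrying just the two flags from the diff:

class BnbLoaderStub:
    """Hypothetical stand-in for vLLM's BitsAndBytes loader state."""

    def __init__(self, pre_quant: bool, load_8bit: bool):
        self.pre_quant = pre_quant
        self.load_8bit = load_8bit

    def check_fused_moe(self) -> None:
        # Before this commit, pre_quant alone (and load_8bit alone) raised
        # here; now only prequant combined with 8bit loading is rejected.
        if self.pre_quant and self.load_8bit:
            raise ValueError(
                "Prequant BitsAndBytes 8bit models with FusedMoE is not "
                "supported yet.")

BnbLoaderStub(pre_quant=True, load_8bit=False).check_fused_moe()  # 4bit prequant: now allowed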

vllm/model_executor/models/qwen3_moe.py

Lines changed: 7 additions & 1 deletion

@@ -52,6 +52,7 @@
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
+from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
@@ -149,8 +150,13 @@ def __init__(
         self.gate = ReplicatedLinear(config.hidden_size,
                                      config.num_experts,
                                      bias=False,
-                                     quant_config=None,
+                                     quant_config=self._maybe_ignore_quant_config(quant_config),  # Some quantization methods do not quantize the gate
                                      prefix=f"{prefix}.gate")
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        if not isinstance(quant_config, (BitsAndBytesConfig)):
+            return None
+        return quant_config
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
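
The new helper keeps the quant config on the MoE router gate only for BitsAndBytes; any other quantization method (or no config at all) leaves the gate's ReplicatedLinear unquantized, matching the previous hard-coded quant_config=None. A self-contained sketch of that behavior, using stand-in classes for vLLM's QuantizationConfig and BitsAndBytesConfig:

from typing import Optional

class QuantizationConfig:  # stand-in for vLLM's base quant config class
    pass

class BitsAndBytesConfig(QuantizationConfig):  # stand-in for the BnB config
    pass

def maybe_ignore_quant_config(
        quant_config: Optional[QuantizationConfig]
) -> Optional[QuantizationConfig]:
    # Keep the config only for BitsAndBytes; every other method leaves the
    # gate unquantized, which was previously the behavior for all methods.
    if not isinstance(quant_config, BitsAndBytesConfig):
        return None
    return quant_config

assert maybe_ignore_quant_config(None) is None
assert maybe_ignore_quant_config(QuantizationConfig()) is None
assert maybe_ignore_quant_config(BitsAndBytesConfig()) is not None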

0 commit comments