vllm/model_executor/layers/fused_moe (1 file changed, +8 -2 lines)

```diff
@@ -135,15 +135,21 @@ def maybe_make_prepare_finalize(
         block_shape=quant_config.block_shape,
     )
 
+    in_dtype = (
+        quant_config.quant_dtype
+        if quant_config.quant_dtype is not None
+        else moe.in_dtype
+    )
+
     all_to_all_args = dict(
         max_num_tokens=moe.max_num_tokens,
         num_experts=moe.num_experts,
         num_experts_per_token=moe.experts_per_token,
         expert_padding=1,  # TODO: tests use 1 or 16
         hidden_dim=moe.hidden_dim,
         hidden_dim_scale=hidden_dim_scale,
-        in_dtype=moe.in_dtype,
-        out_dtype=moe.in_dtype,  # or quant type?
+        in_dtype=in_dtype,
+        out_dtype=in_dtype,
         scale_dtype=torch.float32,
         max_private_tokens=None,  # For tuning
     )
```
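For context, here is a minimal standalone sketch of the dtype-fallback pattern this change introduces. `QuantConfig` and `MoEConfig` below are hypothetical stand-ins for vLLM's actual quantization and MoE config objects, reduced to the two fields the logic reads; only the selection expression itself mirrors the diff.

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class QuantConfig:
    # Hypothetical stand-in: the dtype activations are quantized to before
    # dispatch, or None when the model runs unquantized.
    quant_dtype: Optional[torch.dtype] = None


@dataclass
class MoEConfig:
    # Hypothetical stand-in: the activation dtype entering the MoE layer.
    in_dtype: torch.dtype = torch.bfloat16


def resolve_all_to_all_dtype(quant_config: QuantConfig, moe: MoEConfig) -> torch.dtype:
    """Prefer the quantized dtype for the all-to-all buffers when one is set,
    so dispatched activations travel in their smaller quantized form; fall
    back to the layer's activation dtype otherwise."""
    return (
        quant_config.quant_dtype
        if quant_config.quant_dtype is not None
        else moe.in_dtype
    )


# With fp8 quantization enabled, the all-to-all moves fp8 tensors:
assert resolve_all_to_all_dtype(QuantConfig(torch.float8_e4m3fn), MoEConfig()) == torch.float8_e4m3fn
# Without quantization, it falls back to the unquantized activation dtype:
assert resolve_all_to_all_dtype(QuantConfig(), MoEConfig()) == torch.bfloat16
```

Setting `out_dtype` to the same resolved dtype also settles the old `# or quant type?` comment on the removed line: both sides of the all-to-all now agree on the (possibly quantized) wire representation, with dequantization scales carried separately as `scale_dtype=torch.float32`.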