
Commit 86644be

MekkCyber and sywangyi authored
[Quantization] FBgemm FP8 for XPU (#42773)
* enable xpu in fp8_gemm
* refine the code
* updated
* fix
* style
* small fix

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
Co-authored-by: Wang, Yi <yi.a.wang@intel.com>
1 parent a8f32a0 commit 86644be
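For orientation, this commit extends the existing FBGEMM FP8 quantization path so it also runs on Intel XPU devices. Below is a minimal usage sketch of how the feature would be exercised from user code; the checkpoint id and prompt are placeholders, not taken from this commit.

# Minimal usage sketch; "some-org/some-model" is a hypothetical checkpoint id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config

model_id = "some-org/some-model"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# On XPU the quantize op is fetched from the `kernels` hub and the matmuls go
# through torch._scaled_mm; on CUDA the fbgemm-gpu kernels are used as before.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=FbgemmFp8Config(),
    device_map="auto",  # resolves to "xpu" or "cuda" when one is available
)

inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=16, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))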

File tree

3 files changed: +135 additions, −63 deletions

src/transformers/integrations/fbgemm_fp8.py

Lines changed: 88 additions & 38 deletions
@@ -12,12 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from functools import lru_cache
 from typing import Optional
 
 from ..activations import ACT2FN
 from ..core_model_loading import ConversionOps
 from ..quantizers.quantizers_utils import get_module_from_name, should_convert_module
-from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
+from ..utils import (
+    is_accelerate_available,
+    is_fbgemm_gpu_available,
+    is_torch_available,
+    is_torch_xpu_available,
+    logging,
+)
 
 
 if is_torch_available():
@@ -27,7 +34,9 @@
 if is_accelerate_available():
     from accelerate import init_empty_weights
 
-if is_fbgemm_gpu_available():
+_is_torch_xpu_available = is_torch_xpu_available()
+
+if is_fbgemm_gpu_available() and not _is_torch_xpu_available:
     import fbgemm_gpu.experimental.gen_ai  # noqa: F401
 
 logger = logging.get_logger(__name__)
@@ -61,7 +70,7 @@ def convert(
             flattened_param = transposed_param.reshape(-1, original_shape[-1])
 
             # Quantize using per row instead of per column
-            new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
+            new_value_flat, weight_scale_flat = quantize_fp8_per_row(flattened_param)
 
             # Reshape back to original dimensions
             new_value = new_value_flat.reshape(original_shape)
@@ -77,14 +86,14 @@ def convert(
             flattened_param = transposed_param.reshape(-1, original_shape[-1])
 
             # Quantize using per column
-            new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
+            new_value_flat, weight_scale_flat = quantize_fp8_per_row(flattened_param)
 
             # Reshape back to original dimensions
             new_value = new_value_flat.reshape(original_shape)
             new_value = new_value.transpose(1, 2)
             weight_scale = weight_scale_flat.reshape(original_shape[0], original_shape[1], 1)
         else:
-            new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(value)
+            new_value, weight_scale = quantize_fp8_per_row(value)
             weight_scale = torch.nn.Parameter(weight_scale.view(weight_scale.shape[0], 1))
 
         return {target_key: torch.nn.Parameter(new_value), f"{target_key}_scale": weight_scale}
@@ -110,18 +119,26 @@ def forward(self, x):
         output_shape = (*x.shape[:-1], -1)
         # x_quantized and x_scale are not necessarily on the same device as x, this is an issue.
         # https://github.com/pytorch/FBGEMM/blob/e08af8539c391437f447173863df0f3f6f6f1855/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu#L1237C3-L1237C45
-        x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
-            x.view(-1, x.shape[-1]).contiguous(), scale_ub=self.input_scale_ub
-        )
+        x_quantized, x_scale = quantize_fp8_per_row(x.view(-1, x.shape[-1]).contiguous(), scale_ub=self.input_scale_ub)
         # moving x_quantized, x_scale here creates glibberish output ... However, if we move the output, it works
         # x_quantized, x_scale = x_quantized.to(x.device), x_scale.to(x.device)
 
         # The computation still happens on the device where self.weight is even if x_quantized is not on the same device as self.weight
         weight_scale_float32 = self.weight_scale.to(torch.float32)
-        output = torch.ops.fbgemm.f8f8bf16_rowwise(
-            x_quantized, self.weight, x_scale, weight_scale_float32, use_fast_accum=True
-        )
-        output = output + self.bias if self.bias is not None else output
+        if _is_torch_xpu_available:
+            output = torch._scaled_mm(
+                x_quantized,
+                self.weight.t(),
+                scale_a=x_scale.unsqueeze(-1),
+                scale_b=weight_scale_float32.t(),
+                out_dtype=x.dtype,
+                bias=self.bias,
+            )
+        else:
+            output = torch.ops.fbgemm.f8f8bf16_rowwise(
+                x_quantized, self.weight, x_scale, weight_scale_float32, use_fast_accum=True
+            )
+            output = output + self.bias if self.bias is not None else output
        # Hacky for now, we have the output to the device of x
        output = output.to(x.device)
        output = output.reshape(output_shape)
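As a reading aid for the hunk above: per-row FP8 quantization keeps one scale per row (scale = row amax / FP8 max), and the row-wise scaled matmul is mathematically a dequantize-then-GEMM. The sketch below is CPU-only reference math under that assumption; the helper names are invented here, and it is not the fused quantize_fp8_per_row / f8f8bf16_rowwise / torch._scaled_mm kernels.

# Reference-only sketch of per-row FP8 quantization and a row-wise scaled matmul.
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def quantize_fp8_per_row_reference(x: torch.Tensor):
    # One scale per row so that x / scale fits into the FP8 dynamic range.
    amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = (amax / FP8_MAX).to(torch.float32)
    return (x / scale).to(torch.float8_e4m3fn), scale.squeeze(-1)

def rowwise_scaled_matmul_reference(x_fp8, x_scale, w_fp8, w_scale, bias=None):
    # Equivalent math: out = (x_fp8 * x_scale) @ (w_fp8 * w_scale).T [+ bias]
    x = x_fp8.to(torch.float32) * x_scale[:, None]
    w = w_fp8.to(torch.float32) * w_scale[:, None]
    out = x @ w.T
    if bias is not None:
        out = out + bias
    return out.to(torch.bfloat16)

x = torch.randn(4, 16)   # activations: 4 tokens, hidden size 16
w = torch.randn(8, 16)   # weight: 8 output features
x_q, x_s = quantize_fp8_per_row_reference(x)
w_q, w_s = quantize_fp8_per_row_reference(w)
print(rowwise_scaled_matmul_reference(x_q, x_s, w_q, w_s).shape)  # torch.Size([4, 8])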
@@ -173,48 +190,79 @@ def forward(self, hidden_states):
             expert_hidden = hidden_states[i]
             expert_hidden_reshaped = expert_hidden.reshape(-1, self.hidden_size)
             # Quantize for this expert
-            expert_quantized, expert_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+            expert_quantized, expert_scale = quantize_fp8_per_row(
                 expert_hidden_reshaped, num_tokens, self.input_scale_ub
             )
             sharded_expert_dim = self.gate_up_proj.shape[-1] // 2
             gate_up_proj_scale_float32 = self.gate_up_proj_scale.to(torch.float32)
+            if _is_torch_xpu_available:
+                gate = torch._scaled_mm(
+                    expert_quantized,
+                    self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous().t(),
+                    scale_a=expert_scale.unsqueeze(-1),
+                    scale_b=gate_up_proj_scale_float32[i][0][:sharded_expert_dim].view(-1, 1).contiguous().t(),
+                    out_dtype=hidden_states.dtype,
+                )
+                up = torch._scaled_mm(
+                    expert_quantized,
+                    self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous().t(),
+                    scale_a=expert_scale.unsqueeze(-1),
+                    scale_b=gate_up_proj_scale_float32[i][0][sharded_expert_dim:].view(-1, 1).contiguous().t(),
+                    out_dtype=hidden_states.dtype,
+                )
+            else:
+                gate = torch.ops.fbgemm.f8f8bf16_rowwise(
+                    expert_quantized,
+                    self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous(),
+                    expert_scale,
+                    gate_up_proj_scale_float32[i][0][:sharded_expert_dim].view(-1, 1).contiguous(),
+                    use_fast_accum=True,
+                )
 
-            gate = torch.ops.fbgemm.f8f8bf16_rowwise(
-                expert_quantized,
-                self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous(),
-                expert_scale,
-                gate_up_proj_scale_float32[i][0][:sharded_expert_dim].view(-1, 1).contiguous(),
-                use_fast_accum=True,
-            )
-
-            up = torch.ops.fbgemm.f8f8bf16_rowwise(
-                expert_quantized,
-                self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous(),
-                expert_scale,
-                gate_up_proj_scale_float32[i][0][sharded_expert_dim:].view(-1, 1).contiguous(),
-                use_fast_accum=True,
-            )
+                up = torch.ops.fbgemm.f8f8bf16_rowwise(
+                    expert_quantized,
+                    self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous(),
+                    expert_scale,
+                    gate_up_proj_scale_float32[i][0][sharded_expert_dim:].view(-1, 1).contiguous(),
+                    use_fast_accum=True,
+                )
 
             activated = up * self.act_fn(gate)
 
-            activated_quantized, activated_scale = torch.ops.fbgemm.quantize_fp8_per_row(
-                activated, num_tokens, self.input_scale_ub
-            )
+            activated_quantized, activated_scale = quantize_fp8_per_row(activated, num_tokens, self.input_scale_ub)
 
             down_proj_scale_float32 = self.down_proj_scale.to(torch.float32)
-            expert_output = torch.ops.fbgemm.f8f8bf16_rowwise(
-                activated_quantized,
-                self.down_proj[i].transpose(0, 1).contiguous(),
-                activated_scale,
-                down_proj_scale_float32[i].view(-1, 1).contiguous(),
-                use_fast_accum=True,
-            )
+            if _is_torch_xpu_available:
+                expert_output = torch._scaled_mm(
+                    activated_quantized,
+                    self.down_proj[i].transpose(0, 1).contiguous(),
+                    scale_a=activated_scale.unsqueeze(-1),
+                    scale_b=down_proj_scale_float32[i].view(-1, 1).contiguous().t(),
+                    out_dtype=hidden_states.dtype,
+                )
+            else:
+                expert_output = torch.ops.fbgemm.f8f8bf16_rowwise(
+                    activated_quantized,
+                    self.down_proj[i].transpose(0, 1).contiguous(),
+                    activated_scale,
+                    down_proj_scale_float32[i].view(-1, 1).contiguous(),
+                    use_fast_accum=True,
+                )
 
             next_states[i] = expert_output
         next_states = next_states.to(hidden_states.device)
         return next_states.view(-1, self.hidden_size)
 
 
+@lru_cache(maxsize=1)
+def get_quantize_fp8_per_row():
+    if _is_torch_xpu_available:
+        from kernels import get_kernel
+
+        return get_kernel("kernels-community/fp8-fbgemm").quantize_fp8_per_row
+    return torch.ops.fbgemm.quantize_fp8_per_row
+
+
 def replace_with_fbgemm_fp8_linear(
     model, modules_to_not_convert: list[str] | None = None, quantization_config=None, pre_quantized=False, tp_plan=None
 ):
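Stripped of quantization, the per-expert math in the hunk above is a fused gate/up projection split in half at sharded_expert_dim, a gated activation (up * act_fn(gate)), and a down projection. Below is a plain-float sketch of that structure; the shapes and the SiLU activation are illustrative assumptions, and the diff splits the fused weight before the matmul, which is equivalent to splitting the fused output as done here.

import torch
import torch.nn.functional as F

hidden_size, intermediate = 16, 32
x = torch.randn(4, hidden_size)                             # tokens routed to one expert
gate_up_proj = torch.randn(hidden_size, 2 * intermediate)   # fused [gate | up] weight
down_proj = torch.randn(intermediate, hidden_size)

gate, up = torch.chunk(x @ gate_up_proj, 2, dim=-1)  # split at sharded_expert_dim
activated = up * F.silu(gate)                        # `up * self.act_fn(gate)` in the diff
out = activated @ down_proj
print(out.shape)  # torch.Size([4, 16])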
@@ -232,6 +280,8 @@ def replace_with_fbgemm_fp8_linear(
         pre_quantized (`book`, defaults to `False`):
             Whether the model is pre-quantized or not
     """
+    global quantize_fp8_per_row
+    quantize_fp8_per_row = get_quantize_fp8_per_row()
 
     has_been_replaced = False
     module_kwargs = {} if pre_quantized else {"dtype": None}
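The new get_quantize_fp8_per_row() helper and the global binding above form a resolve-once dispatch: the backend is decided when replace_with_fbgemm_fp8_linear() runs, and the quantize op is looked up once rather than re-resolved on every call. A standalone sketch of the same pattern follows; the names resolve_quantize_op and use_xpu are invented, and each branch assumes its backend package is installed.

from functools import lru_cache

import torch

@lru_cache(maxsize=1)
def resolve_quantize_op(use_xpu: bool):
    if use_xpu:
        # XPU: fetch the op from the Hugging Face kernel hub (`pip install kernels`).
        from kernels import get_kernel

        return get_kernel("kernels-community/fp8-fbgemm").quantize_fp8_per_row
    # CUDA: fbgemm-gpu registers the op under torch.ops.fbgemm at import time.
    return torch.ops.fbgemm.quantize_fp8_per_row

use_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()
quantize_fp8_per_row = resolve_quantize_op(use_xpu)  # resolved once, cached afterwards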

src/transformers/quantizers/quantizer_fbgemm_fp8.py

Lines changed: 24 additions & 13 deletions
@@ -19,14 +19,21 @@
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel
 
-from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
+from ..utils import (
+    is_accelerate_available,
+    is_fbgemm_gpu_available,
+    is_kernels_available,
+    is_torch_available,
+    is_torch_cuda_available,
+    is_torch_xpu_available,
+    logging,
+)
 from .quantizers_utils import get_module_from_name
 
 
 if is_torch_available():
     import torch
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -41,27 +48,32 @@ def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
 
     def validate_environment(self, *args, **kwargs):
-        if not is_fbgemm_gpu_available():
+        if not is_torch_cuda_available() and not is_torch_xpu_available():
+            raise ImportError("Using fbgemm fp8 quantization requires a GPU or XPU")
+        if is_torch_xpu_available() and not is_kernels_available():
+            raise ImportError("Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)")
+        if is_torch_cuda_available() and not is_fbgemm_gpu_available():
             raise ImportError(
-                "Using fbgemm fp8 quantization requires fbgemm-gpu library"
+                "Loading an FP8 fbgemm quantized model on CUDA requires fbgemm-gpu library"
                 "Please install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
             )
         if not is_accelerate_available():
             raise ImportError(
                 "Loading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)"
             )
-        compute_capability = torch.cuda.get_device_capability()
-        major, _ = compute_capability
-        if major < 9:
-            raise ValueError(
-                "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)"
-            )
+        if is_torch_cuda_available():
+            compute_capability = torch.cuda.get_device_capability()
+            major, _ = compute_capability
+            if major < 9:
+                raise ValueError(
+                    "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)"
+                )
 
         device_map = kwargs.get("device_map")
         if device_map is None:
             logger.warning_once(
-                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
-                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. "
+                "You have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set "
+                "your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu' or 'auto'. "
             )
         elif isinstance(device_map, dict):
             if not self.pre_quantized and ("cpu" in device_map.values() or "disk" in device_map.values()):
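A quick way to check the prerequisites that the reworked validate_environment() above enforces, using the same public helpers it imports (the standalone script itself is illustrative, not part of the commit):

from transformers.utils import (
    is_accelerate_available,
    is_fbgemm_gpu_available,
    is_kernels_available,
    is_torch_cuda_available,
    is_torch_xpu_available,
)

if is_torch_xpu_available():
    assert is_kernels_available(), "XPU path needs `pip install kernels`"
elif is_torch_cuda_available():
    assert is_fbgemm_gpu_available(), "CUDA path needs the fbgemm-gpu library"
else:
    raise RuntimeError("FBGEMM FP8 quantization requires a CUDA GPU or an Intel XPU")
assert is_accelerate_available(), "FP8 loading needs `pip install accelerate`"
print("environment looks good for FBGEMM FP8")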
@@ -121,7 +133,6 @@ def _process_model_before_weight_loading(
             modules_to_not_convert=self.modules_to_not_convert,
             quantization_config=self.quantization_config,
             pre_quantized=self.pre_quantized,
-            config=model.config,
             tp_plan=model._tp_plan,
         )
 
tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py

Lines changed: 23 additions & 12 deletions
@@ -21,14 +21,19 @@
 from transformers.testing_utils import (
     backend_empty_cache,
     require_accelerate,
-    require_fbgemm_gpu,
+    require_deterministic_for_xpu,
     require_read_token,
-    require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
     slow,
     torch_device,
 )
-from transformers.utils import is_accelerate_available, is_torch_available
+from transformers.utils import (
+    is_accelerate_available,
+    is_fbgemm_gpu_available,
+    is_torch_available,
+    is_torch_xpu_available,
+)
 
 
 if is_torch_available():
@@ -38,7 +43,7 @@
     from accelerate import init_empty_weights
 
 
-@require_torch_gpu
+@require_torch_accelerator
 class FbgemmFp8ConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
@@ -62,8 +67,8 @@ def test_from_dict(self):
 
 
 @slow
-@require_torch_gpu
-@require_fbgemm_gpu
+@require_torch_accelerator
+@unittest.skipIf(not is_torch_xpu_available() and not is_fbgemm_gpu_available(), "test requires fbgemm-gpu or xpu")
 @require_accelerate
 @require_read_token
 class FbgemmFp8Test(unittest.TestCase):
@@ -76,10 +81,11 @@ class FbgemmFp8Test(unittest.TestCase):
         [
            "What are we having for dinner?\nI'm having a steak and a salad",
            "What are we having for dinner? I don’t know. What are we having",
+           "What are we having for dinner? I don’t know, what are you having",
         ]
     )
 
-    device_map = "cuda"
+    device_map = "xpu" if is_torch_xpu_available() else "cuda"
 
     offload_device_map = {
         "model.embed_tokens": 0,
@@ -176,6 +182,7 @@ def test_quantized_model_conversion(self):
 
         self.assertEqual(nb_linears - 24, nb_fbgemm_linear)
 
+    @require_deterministic_for_xpu
     def test_quantized_model(self):
         """
         Simple test that checks if the quantized model is working properly
@@ -185,6 +192,7 @@ def test_quantized_model(self):
         output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
         self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
 
+    @require_deterministic_for_xpu
     def test_save_pretrained(self):
         """
         Simple test that checks if the quantized model is working properly after being saved and loaded
@@ -219,7 +227,8 @@ def test_change_loading_attributes(self):
         output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
         self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
 
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
+    @require_deterministic_for_xpu
     def test_quantized_model_multi_gpu(self):
         """
         Simple test that checks if the quantized model is working properly with multiple GPUs
@@ -248,6 +257,7 @@ def test_quantized_model_offload(self):
             self.model_name, device_map=self.offload_device_map, quantization_config=quantization_config
         )
 
+    @require_deterministic_for_xpu
     def test_save_pretrained_offload(self):
         """
         Simple test that checks if the saved quantized model is working properly cpu/disk offload
@@ -261,7 +271,8 @@ def test_save_pretrained_offload(self):
         output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
         self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
 
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
+    @require_deterministic_for_xpu
     def test_save_pretrained_multi_gpu(self):
         """
         Simple test that checks if the quantized model is working properly after being saved and loaded
@@ -278,9 +289,9 @@ def test_save_pretrained_multi_gpu(self):
         self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @require_accelerate
-@require_fbgemm_gpu
+@unittest.skipIf(not is_torch_xpu_available() and not is_fbgemm_gpu_available(), "test requires fbgemm-gpu or xpu")
 class FbgemmFp8LinearTest(unittest.TestCase):
     def test_linear_preserves_shape(self):
         """
