Commit 486fe0d

[sparse] Migrate Float8SemiSparseTensor off of AQT (#3361)

This PR migrates `Float8DynamicActivationFloat8SemiSparseWeightConfig` off of the AQT `CutlassSemiSparseLayout` subclass. The old AQT flow can still be used by passing `version=1` into the config.

Testing:
```
pytest test/quantization/quantize_/workflows/float8/test_float8_semi_sparse_tensor.py
```
1 parent: 7035fb7 · commit: 486fe0d
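For orientation, a minimal sketch of the new flow, mirroring the test added in this commit; the layer shape and the fake-sparsity helper are taken from that test, and the `version=1` comment reflects the commit message rather than code shown in this diff:

```python
from torch import nn

from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization.granularity import PerRow
from torchao.quantization.quant_api import quantize_
from torchao.quantization.quantize_.workflows import Float8PackingFormat
from torchao.sparsity import apply_fake_sparsity

model = nn.Linear(256, 1024).bfloat16().cuda().eval()
apply_fake_sparsity(model)  # weight needs a 2:4 sparsity pattern before packing

# New flow: FP8 dynamic activation + FP8 2:4 sparse weight via the CUTLASS kernel path.
quantize_(
    model,
    Float8DynamicActivationFloat8WeightConfig(
        version=2,
        packing_format=Float8PackingFormat.SPARSE_CUTLASS,
        granularity=PerRow(),
    ),
)

# Per the commit message, passing version=1 into the semi-sparse config keeps
# the old AQT CutlassSemiSparseLayout flow.
```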

5 files changed: +438 additions, −15 deletions
Lines changed: 126 additions & 0 deletions (new file)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
import copy
import logging
import unittest

import torch
from torch import nn
from torch.testing._internal import common_utils

from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
)
from torchao.quantization.granularity import PerRow
from torchao.quantization.quant_api import (
    quantize_,
)
from torchao.quantization.quantize_.workflows import (
    Float8PackingFormat,
)
from torchao.quantization.utils import compute_error
from torchao.sparsity import apply_fake_sparsity
from torchao.utils import is_sm_at_least_90

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)


class TestSparse2x4Float8Tensor(common_utils.TestCase):
    @unittest.skipIf(not is_sm_at_least_90(), "Need H100 to run")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    @common_utils.parametrize("compile", [True, False])
    def test_fp8_cutlass_sparse(self, compile):
        with torch.inference_mode():
            input = torch.rand((256, 256), dtype=torch.bfloat16, device="cuda")
            model = (
                nn.Sequential(
                    nn.Linear(256, 1024),
                    nn.Linear(1024, 256),
                )
                .bfloat16()
                .cuda()
                .eval()
            )

            apply_fake_sparsity(model)
            baseline_result = model(input)
            model_copy = copy.deepcopy(model)

            # Quantized
            quantize_(model_copy, Float8DynamicActivationFloat8WeightConfig())
            dense_result = model_copy(input)
            dense_sqnr = compute_error(baseline_result, dense_result)

            # Sparse + quantized
            quantize_(
                model,
                Float8DynamicActivationFloat8WeightConfig(
                    version=2,
                    packing_format=Float8PackingFormat.SPARSE_CUTLASS,
                    granularity=PerRow(),
                ),
            )
            if compile:
                model = torch.compile(model)
            sparse_result = model(input)
            sparse_sqnr = compute_error(baseline_result, sparse_result)

            self.assertEqual(dense_sqnr, sparse_sqnr)

    @unittest.skipIf(not is_sm_at_least_90(), "Need H100 to run")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_fp8_cutlass_sparse_lowering_op_clone(self):
        with torch.inference_mode():
            model = nn.Linear(256, 1024).half().cuda().eval()
            apply_fake_sparsity(model)
            quantize_(
                model,
                Float8DynamicActivationFloat8WeightConfig(
                    version=2,
                    packing_format=Float8PackingFormat.SPARSE_CUTLASS,
                    granularity=PerRow(),
                ),
            )

            original = model.weight.dequantize()
            cloned = model.weight.clone().dequantize()

            for o, c in zip(original, cloned):
                self.assertEqual(o, c)

    @unittest.skipIf(not is_sm_at_least_90(), "Need H100 to run")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_fp8_cutlass_sparse_lowering_op_to(self):
        # Need to run with inference mode to avoid dispatching to `aten.to_copy`
        with torch.inference_mode():
            model = nn.Linear(256, 1024).half().cuda().eval()
            apply_fake_sparsity(model)
            model_copy = copy.deepcopy(model)
            expected = model_copy.weight.to(dtype=torch.float)

            quantize_(
                model,
                Float8DynamicActivationFloat8WeightConfig(
                    version=2,
                    packing_format=Float8PackingFormat.SPARSE_CUTLASS,
                    granularity=PerRow(),
                ),
            )

            original = torch.ops.aten.to.dtype_layout(
                model.weight,
                dtype=torch.float,
                layout=torch.strided,
            )
            torch.testing.assert_close(expected, original, atol=1e-1, rtol=1e-1)


common_utils.instantiate_parametrized_tests(TestSparse2x4Float8Tensor)

if __name__ == "__main__":
    unittest.main()
```
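`compute_error` in these tests reports the mismatch as an SQNR. A minimal sketch of that metric, assuming the usual signal-to-quantization-noise-ratio-in-dB definition; the helper name here is illustrative, not torchao's API:

```python
import torch


def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> torch.Tensor:
    # 20 * log10(||reference|| / ||reference - candidate||): higher means closer.
    return 20 * torch.log10(
        torch.linalg.norm(reference) / torch.linalg.norm(reference - candidate)
    )
```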

torchao/quantization/quant_api.py

Lines changed: 29 additions & 15 deletions

```diff
@@ -81,6 +81,7 @@
     KernelPreference,
 )
 from torchao.quantization.quantize_.workflows import (
+    Float8PackingFormat,
     Float8Tensor,
     Int4ChooseQParamsAlgorithm,
     Int4MarlinSparseTensor,
@@ -96,6 +97,7 @@
     IntxUnpackedToInt8Tensor,
     QuantizeTensorToFloat8Kwargs,
     QuantizeTensorToInt8Kwargs,
+    Sparse2x4CUTLASSFloat8Tensor,
 )
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
@@ -1588,6 +1590,7 @@ class Float8DynamicActivationFloat8WeightConfig(AOBaseConfig):
     activation_dtype: torch.dtype = e4m3_dtype
     weight_dtype: torch.dtype = e4m3_dtype
     granularity: Optional[Union[FP8Granularity, List[FP8Granularity]]] = None
+    packing_format: Optional[Float8PackingFormat] = Float8PackingFormat.PLAIN
     mm_config: Optional[Float8MMConfig] = None
     activation_value_lb: Optional[float] = None
     activation_value_ub: Optional[float] = None
@@ -1625,6 +1628,7 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
     activation_value_lb = config.activation_value_lb
     activation_value_ub = config.activation_value_ub
     kernel_preference = config.kernel_preference
+    packing_format = config.packing_format

     # Ensure works on device
     _check_hardware_support(granularity)
@@ -1651,31 +1655,41 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         # TODO(future PR): this should really throw an exception instead of silently
         # not doing what the user asked
         return weight
-
-    if isinstance(weight_granularity, PerRow):
+    assert config.version == 2, f"Unexpected version: {config.version}"
+    if packing_format == Float8PackingFormat.PLAIN and isinstance(
+        weight_granularity, PerRow
+    ):
         assert weight.dtype == torch.bfloat16, (
             "PerRow quantization only works for bfloat16 precision input weight"
         )
-
-    assert config.version == 2, f"Unexpected version: {config.version}"
     act_quant_kwargs = QuantizeTensorToFloat8Kwargs(
         activation_dtype,
         activation_granularity,
         hp_value_lb=activation_value_lb,
         hp_value_ub=activation_value_ub,
         kernel_preference=kernel_preference,
     )
-
-    quantized_weight = Float8Tensor.from_hp(
-        weight,
-        float8_dtype=weight_dtype,
-        granularity=weight_granularity,
-        mm_config=mm_config,
-        kernel_preference=kernel_preference,
-        act_quant_kwargs=act_quant_kwargs,
-    )
-
-    return quantized_weight
+    if packing_format == Float8PackingFormat.PLAIN:
+        quantized_weight = Float8Tensor.from_hp(
+            weight,
+            float8_dtype=weight_dtype,
+            granularity=weight_granularity,
+            mm_config=mm_config,
+            kernel_preference=kernel_preference,
+            act_quant_kwargs=act_quant_kwargs,
+        )
+        return quantized_weight
+    elif packing_format == Float8PackingFormat.SPARSE_CUTLASS:
+        assert isinstance(weight_granularity, PerRow), (
+            "Sparse packing format only supports per-row quantization"
+        )
+        quantized_weight = Sparse2x4CUTLASSFloat8Tensor.from_hp(
+            weight,
+            float8_dtype=weight_dtype,
+            granularity=weight_granularity,
+            act_quant_kwargs=act_quant_kwargs,
+        )
+        return quantized_weight


 @register_quantize_module_handler(Float8DynamicActivationFloat8WeightConfig)
```
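Based on this dispatch, the packing format selects which tensor subclass backs the linear weight. A minimal sketch of checking that, with layer sizes as placeholders and the `isinstance` checks as an assumption about how the subclasses surface on `module.weight`:

```python
from torch import nn

from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization.granularity import PerRow
from torchao.quantization.quant_api import quantize_
from torchao.quantization.quantize_.workflows import (
    Float8PackingFormat,
    Float8Tensor,
    Sparse2x4CUTLASSFloat8Tensor,
)
from torchao.sparsity import apply_fake_sparsity

# PLAIN (default) packing format -> Float8Tensor weight.
dense = nn.Linear(256, 1024).bfloat16().cuda().eval()
quantize_(
    dense,
    Float8DynamicActivationFloat8WeightConfig(version=2, granularity=PerRow()),
)
assert isinstance(dense.weight, Float8Tensor)

# SPARSE_CUTLASS packing format -> Sparse2x4CUTLASSFloat8Tensor weight.
sparse = nn.Linear(256, 1024).bfloat16().cuda().eval()
apply_fake_sparsity(sparse)  # weight must be 2:4 sparse before packing
quantize_(
    sparse,
    Float8DynamicActivationFloat8WeightConfig(
        version=2,
        packing_format=Float8PackingFormat.SPARSE_CUTLASS,
        granularity=PerRow(),
    ),
)
assert isinstance(sparse.weight, Sparse2x4CUTLASSFloat8Tensor)
```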

torchao/quantization/quantize_/workflows/__init__.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -1,7 +1,13 @@
+from .float8.float8_packing_format import (
+    Float8PackingFormat,
+)
 from .float8.float8_tensor import (
     Float8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
+from .float8.sparse_2x4_cutlass_float8_tensor import (
+    Sparse2x4CUTLASSFloat8Tensor,
+)
 from .int4.int4_choose_qparams_algorithm import Int4ChooseQParamsAlgorithm
 from .int4.int4_marlin_sparse_tensor import (
     Int4MarlinSparseTensor,
@@ -41,6 +47,8 @@
     "Int8Tensor",
     "QuantizeTensorToInt8Kwargs",
     "Float8Tensor",
+    "Sparse2x4CUTLASSFloat8Tensor",
+    "Float8PackingFormat",
     "QuantizeTensorToFloat8Kwargs",
     "Int8Tensor",
     "QuantizeTensorToInt8Kwargs",
```
Lines changed: 38 additions & 0 deletions (new file)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.


from enum import Enum

import torch

__all__ = [
    "Float8PackingFormat",
]


# can switch to StrEnum (https://docs.python.org/3/library/enum.html#enum.StrEnum)
# after python 3.10 is end of life (https://devguide.python.org/versions/)
class Float8PackingFormat(str, Enum):
    """
    plain packing format for Float8Tensor will lay out elements in Tensor sequentially,
    for example: for a Tensor of shape (4, 6):
    a_0_0, a_0_1, ..., a_0_5,
    ...
    a_3_0, a_3_1, ..., a_3_5
    """

    PLAIN = "plain"
    """
    Sparse packing format for 2:4 sparsity + FP8 quantization

    SPARSE_CUTLASS will pack the quantized_data into two tensors, qdata and sparse_metadata, for the specified values and metadata respectively.
    This packing format will dispatch to `rowwise_scaled_linear_sparse_cutlass_f8f8`, which will fuse the per-row scaling into the sparse matmul.
    """
    SPARSE_CUTLASS = "sparse_cutlass"


torch.serialization.add_safe_globals([Float8PackingFormat])
```
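Because `Float8PackingFormat` mixes in `str`, members compare equal to their plain string values, and the `add_safe_globals` call above registers the enum so checkpoints containing it can load under `torch.load(..., weights_only=True)`. A small sketch of the string-enum behavior:

```python
from torchao.quantization.quantize_.workflows import Float8PackingFormat

# str-mixin Enum members compare equal to their string values,
# so configs can be built from either the member or the string.
assert Float8PackingFormat.SPARSE_CUTLASS == "sparse_cutlass"
assert Float8PackingFormat("plain") is Float8PackingFormat.PLAIN
```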
