Skip to content

Commit 1d1e10d

Browse files
committed
review changes
1 parent 33c0508 commit 1d1e10d

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

backends/openvino/quantizer/llm_compression.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def get_calibration_data(
2626
This method is used to obtain calibration data from a prompt so that the algorithm
2727
is calibrated not only with the dataset but also with the inputs which are output by
2828
the model.
29+
Currently, this method is only tested with Llama models.
2930
"""
3031
# TODO: change criteria & support batch inputs if necessary
3132
pos = torch.tensor(0, dtype=torch.int64)
@@ -53,6 +54,7 @@ def get_calibration_data(
5354
def transform_fn(token_pos_map: tuple[int, int]):
5455
"""
5556
Transforms and returns input from dataset so that it is acceptable by the model
57+
Currently, this method is only tested with Llama models.
5658
5759
:param token_pos_map: This input contains the position and its token ID
5860
"""

backends/openvino/quantizer/quantizer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,15 @@ def __init__(
134134

135135
def get_weights_compression_config(self) -> Dict[str, Any]:
136136
"""
137-
Returns the config attributes like all_layers, group_size, backup_mode and Quantization mode
138-
from the quantizer's internal algorithm. These attributes are defined when initializing the quantizer
139-
and is used by the compress pt2e's experimental algorithm.
140-
141-
:return: A dictionary containing elements which are defined in the quantizer and their value from the
142-
internal algorithm.
137+
Returns a dictionary with all_layers, group_size, backup_mode and Quantization mode parameters
138+
used by the compress_pt2e weight compression algorithm.
139+
140+
:return: A dictionary containing:
141+
1. mode: Quantization mode. One of INT4 Sym, INT4 Asym, INT8 Sym, INT8 Asym.
142+
2. group_size: group size to be used for group-wise compression.
143+
3. all_layers: Indicates whether embeddings and last MatMul layers should be compressed to a primary
144+
precision. By default, the backup precision is assigned for the embeddings and last MatMul layers.
145+
4. backup_mode: Defines a backup mode for mixed-precision weight compression.
143146
"""
144147
quantizer_initialized_algo_attributes = {
145148
"mode": self._algo.mode,

0 commit comments

Comments
 (0)