298 changes: 201 additions & 97 deletions docs/api_reference/quantization.rst

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions quantllm/api.py
@@ -0,0 +1,124 @@
from typing import Optional, Dict, Any, Tuple
from transformers import PreTrainedModel
from .quant.awq import AWQQuantizer
from .quant.gptq import GPTQQuantizer
from .quant.gguf import GGUFQuantizer
from .trainer.logger import TrainingLogger

class QuantizerFactory:
@staticmethod
def quantize_from_pretrained(
model_name_or_path: str,
method: str,
quant_config_dict: Optional[Dict[str, Any]] = None,
calibration_data: Optional[Any] = None, # Typically torch.Tensor or similar
calibration_steps: Optional[int] = 100, # Specific to AWQ's quantize method
device: Optional[str] = None # Explicit device control
) -> Tuple[PreTrainedModel, Any]: # Returns (quantized_model, tokenizer)
"""
Loads a model from Hugging Face, quantizes it using the specified method,
and returns the quantized model and its tokenizer.

Args:
model_name_or_path (str): Hugging Face model ID or local path.
method (str): Quantization method to use ('awq', 'gptq', 'gguf').
quant_config_dict (Optional[Dict[str, Any]]): Dictionary with quantization parameters.
Common keys: 'bits', 'group_size', 'batch_size' (for quantizer init).
AWQ specific: 'zero_point', 'awq_version' (maps to 'version' in AWQQuantizer).
GPTQ specific: 'actorder', 'percdamp', 'sym'.
GGUF specific: 'use_packed', 'cpu_offload', 'desc_act', 'desc_ten', 'legacy_format'.
calibration_data (Optional[Any]): Calibration data required for quantization.
calibration_steps (Optional[int]): Number of calibration steps, primarily for AWQ's
quantize() method. Defaults to 100.
device (Optional[str]): Device to run quantization on ('cpu', 'cuda', 'cuda:x').
If None, default device selection logic in BaseQuantizer is used.

Returns:
Tuple[PreTrainedModel, Any]: The quantized model and its associated tokenizer.

Raises:
ValueError: If an unsupported quantization method is specified or essential parameters are missing.
RuntimeError: If the quantizer fails to produce a quantized model.
"""
logger = TrainingLogger()
if quant_config_dict is None:
quant_config_dict = {}

method_lower = method.lower()
logger.log_info(f"Attempting to quantize model '{model_name_or_path}' using method: {method_lower}")

bits = quant_config_dict.get('bits', 4)
group_size = quant_config_dict.get('group_size', 128)
quantizer_batch_size = quant_config_dict.get('batch_size', 4)

quantizer = None

if method_lower == 'awq':
awq_zero_point = quant_config_dict.get('zero_point', True)
awq_version = quant_config_dict.get('awq_version', 'v2')

quantizer = AWQQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
zero_point=awq_zero_point,
version=awq_version,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with AWQ... Bits: {bits}, Group Size: {group_size}, Zero Point: {awq_zero_point}, Version: {awq_version}")
quantizer.quantize( # Call quantize, model is updated in place
calibration_data=calibration_data,
calibration_steps=calibration_steps
)

elif method_lower == 'gptq':
gptq_actorder = quant_config_dict.get('actorder', True)
gptq_percdamp = quant_config_dict.get('percdamp', 0.01)
gptq_sym = quant_config_dict.get('sym', True)

quantizer = GPTQQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
actorder=gptq_actorder,
percdamp=gptq_percdamp,
sym=gptq_sym,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with GPTQ... Bits: {bits}, Group Size: {group_size}, ActOrder: {gptq_actorder}, Sym: {gptq_sym}")
quantizer.quantize(calibration_data=calibration_data) # Model updated in place

elif method_lower == 'gguf':
gguf_use_packed = quant_config_dict.get('use_packed', True)
gguf_cpu_offload = quant_config_dict.get('cpu_offload', False)
gguf_desc_act = quant_config_dict.get('desc_act', False)
gguf_desc_ten = quant_config_dict.get('desc_ten', False)
gguf_legacy_format = quant_config_dict.get('legacy_format', False)

quantizer = GGUFQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
use_packed=gguf_use_packed,
cpu_offload=gguf_cpu_offload,
desc_act=gguf_desc_act,
desc_ten=gguf_desc_ten,
legacy_format=gguf_legacy_format,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with GGUF... Bits: {bits}, Group Size: {group_size}, Packed: {gguf_use_packed}, CPU Offload: {gguf_cpu_offload}")
quantizer.quantize(calibration_data=calibration_data) # Model updated in place

else:
logger.log_error(f"Unsupported quantization method: {method}")
raise ValueError(f"Unsupported quantization method: {method}. Supported methods are 'awq', 'gptq', 'gguf'.")

if quantizer is None or quantizer.model is None:
logger.log_error(f"Failed to initialize quantizer or obtain quantized model for method: {method}")
raise RuntimeError(f"Quantization failed for method: {method}. Quantizer or model is None.")

logger.log_info(f"Successfully quantized model with method: {method_lower}")
return quantizer.model, quantizer.tokenizer
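
A minimal usage sketch of the new QuantizerFactory.quantize_from_pretrained API added above. The model ID, vocabulary size, and calibration tensor shape are placeholders for illustration, not values taken from this PR:

import torch
from quantllm.api import QuantizerFactory

# Placeholder calibration batch of token IDs; tokenized calibration text is preferable in practice.
calibration = torch.randint(0, 32000, (16, 512))

model, tokenizer = QuantizerFactory.quantize_from_pretrained(
    model_name_or_path="facebook/opt-125m",  # placeholder model ID
    method="gptq",
    quant_config_dict={"bits": 4, "group_size": 128, "actorder": True, "sym": True},
    calibration_data=calibration,
    device="cuda",
)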
22 changes: 18 additions & 4 deletions quantllm/quant/awq.py
@@ -6,14 +6,14 @@
import numpy as np
from typing import Optional, Dict, Any, List, Union, Tuple
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

class AWQQuantizer(BaseQuantizer):
"""AWQ quantization implementation with memory-efficient processing."""

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 128,
zero_point: bool = True,
@@ -27,7 +27,8 @@ def __init__(
Initializes the AWQQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 128.
zero_point (bool, optional): Whether to use zero-point quantization for activations. Defaults to True.
@@ -39,7 +40,9 @@
The device for quantization operations ('cpu', 'cuda', etc.).
Inherited from BaseQuantizer. Defaults to None (auto-detection).
"""
super().__init__(model=model, bits=bits, device=device)
# Pass all relevant kwargs to BaseQuantizer
# AWQQuantizer specific args are handled here.
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.zero_point = zero_point
self.scale_dtype = scale_dtype
@@ -101,7 +104,18 @@ def quantize(

self._clear_memory()

# Update model config with quantization parameters
awq_specific_params = {
"zero_point": self.zero_point,
"version": self.version,
"scale_dtype": self.scale_dtype, # Added from __init__
"enable_mnn_kernel": self.enable_mnn_kernel # Added from __init__
# batch_size is a processing parameter and is not usually stored in the model config
}
self._update_model_config_with_quant_params("awq", awq_specific_params)

return self.model

def _collect_activation_stats(
self,
data: torch.Tensor # Removed num_steps parameter
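
For reference, a sketch of how the renamed model_or_model_name_or_path parameter can be used with AWQQuantizer directly; either a Hugging Face model ID/path or an already-loaded PreTrainedModel is accepted. The model ID and calibration tensor below are placeholders, not part of the PR:

import torch
from quantllm.quant.awq import AWQQuantizer

quantizer = AWQQuantizer(
    model_or_model_name_or_path="facebook/opt-125m",  # placeholder; a PreTrainedModel instance also works
    bits=4,
    group_size=128,
    zero_point=True,
    version="v2",
    batch_size=4,
    device="cuda",
)
calibration = torch.randint(0, 32000, (8, 512))  # placeholder token IDs
quantized_model = quantizer.quantize(calibration_data=calibration, calibration_steps=100)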
22 changes: 17 additions & 5 deletions quantllm/quant/gguf.py
@@ -6,7 +6,7 @@
import torch.nn as nn
from typing import Optional, Dict, Any, List, Union, Tuple
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

try:
import ctransformers
@@ -19,7 +19,7 @@ class GGUFQuantizer(BaseQuantizer):

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 32,
desc_act: bool = False,
@@ -34,7 +34,8 @@
Initializes the GGUFQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 32.
desc_act (bool, optional): Whether to describe activations in GGUF metadata. Defaults to False.
@@ -52,7 +53,7 @@
if not CT_AVAILABLE:
raise ImportError("CTransformers is required for GGUF quantization. Install with: pip install ctransformers")

super().__init__(model=model, bits=bits, device=device)
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.desc_act = desc_act
self.desc_ten = desc_ten
@@ -94,9 +95,20 @@
setattr(self.model, name, quantized)

self._clear_memory()

# Update model config with quantization parameters
gguf_specific_params = {
"use_packed": self.use_packed,
"cpu_offload": self.cpu_offload,
"desc_act": self.desc_act,
"desc_ten": self.desc_ten,
"legacy_format": self.legacy_format
# group_size is handled by BaseQuantizer if present as self.group_size
}
self._update_model_config_with_quant_params("gguf", gguf_specific_params)

return self.model

def _collect_stats(self, data: torch.Tensor) -> Dict[str, Dict[str, torch.Tensor]]:
"""Collect statistics for quantization with memory-efficient batch processing."""
stats = {}
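
The _collect_stats docstring above mentions memory-efficient batch processing; a generic sketch of that idea (not the repository's implementation, which returns a nested stats dict) is a per-feature absolute-max reduced over calibration batches:

import torch

def collect_absmax_stats(data: torch.Tensor, batch_size: int = 4) -> torch.Tensor:
    """Per-feature absolute maximum over calibration data, accumulated batch by
    batch to keep peak memory low. Illustrative sketch only."""
    absmax = None
    for start in range(0, data.shape[0], batch_size):
        batch = data[start:start + batch_size].float()
        # Reduce over every dimension except the last (feature) dimension.
        batch_max = batch.abs().amax(dim=tuple(range(batch.dim() - 1)))
        absmax = batch_max if absmax is None else torch.maximum(absmax, batch_max)
    return absmax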
21 changes: 16 additions & 5 deletions quantllm/quant/gptq.py
@@ -6,14 +6,14 @@
import torch.nn as nn
from typing import Optional, Dict, Any, List, Union
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

class GPTQQuantizer(BaseQuantizer):
"""GPTQ quantization implementation with memory-efficient processing."""

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 128,
actorder: bool = False,
@@ -28,7 +28,8 @@ def __init__(
Initializes the GPTQQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 128.
actorder (bool, optional): Whether to use activation order for columns. Defaults to False.
@@ -43,7 +44,7 @@
The device for quantization operations ('cpu', 'cuda', etc.).
Inherited from BaseQuantizer. Defaults to None (auto-detection).
"""
super().__init__(model=model, bits=bits, device=device)
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.actorder = actorder
self.allow_mixed_bits = allow_mixed_bits
@@ -101,8 +102,18 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
self._clear_memory()
del self.H[name]

# Update model config with quantization parameters
gptq_specific_params = {
"actorder": self.actorder,
"sym": self.sym,
"percdamp": self.percdamp,
"allow_mixed_bits": self.allow_mixed_bits # Added from __init__
# use_triton is a runtime/environment flag and does not need to be stored in the model config
}
self._update_model_config_with_quant_params("gptq", gptq_specific_params)

return self.model

def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
"""Compute Hessian approximation for a layer with memory-efficient processing."""
n = layer.in_features
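
The _compute_hessian method above computes the layer-wise Hessian approximation that GPTQ minimizes against; a generic, batched sketch of that approximation (not the repository's code) looks like this. The batch_size default is a placeholder:

import torch
import torch.nn as nn

def hessian_approximation(layer: nn.Linear, data: torch.Tensor, batch_size: int = 4) -> torch.Tensor:
    """GPTQ-style Hessian approximation H ~= (2/N) * sum_i x_i x_i^T over
    calibration inputs x_i (last dimension must equal layer.in_features),
    accumulated batch by batch to limit memory. Illustrative sketch only."""
    n = layer.in_features
    H = torch.zeros(n, n, dtype=torch.float32)
    count = 0
    for start in range(0, data.shape[0], batch_size):
        x = data[start:start + batch_size].reshape(-1, n).float()
        H += 2.0 * x.t() @ x
        count += x.shape[0]
    return H / max(count, 1)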