298 changes: 201 additions & 97 deletions docs/api_reference/quantization.rst

Large diffs are not rendered by default.

124 changes: 124 additions & 0 deletions quantllm/api.py
@@ -0,0 +1,124 @@
from typing import Optional, Dict, Any, Tuple
from transformers import PreTrainedModel
from .quant.awq import AWQQuantizer
from .quant.gptq import GPTQQuantizer
from .quant.gguf import GGUFQuantizer
from .trainer.logger import TrainingLogger

class QuantizerFactory:
@staticmethod
def quantize_from_pretrained(
model_name_or_path: str,
method: str,
quant_config_dict: Optional[Dict[str, Any]] = None,
calibration_data: Optional[Any] = None, # Typically torch.Tensor or similar
calibration_steps: Optional[int] = 100, # Specific to AWQ's quantize method
device: Optional[str] = None # Explicit device control
) -> Tuple[PreTrainedModel, Any]: # Returns (quantized_model, tokenizer)
"""
Loads a model from Hugging Face, quantizes it using the specified method,
and returns the quantized model and its tokenizer.

Args:
model_name_or_path (str): Hugging Face model ID or local path.
method (str): Quantization method to use ('awq', 'gptq', 'gguf').
quant_config_dict (Optional[Dict[str, Any]]): Dictionary with quantization parameters.
Common keys: 'bits', 'group_size', 'batch_size' (for quantizer init).
AWQ specific: 'zero_point', 'awq_version' (maps to 'version' in AWQQuantizer).
GPTQ specific: 'actorder', 'percdamp', 'sym'.
GGUF specific: 'use_packed', 'cpu_offload', 'desc_act', 'desc_ten', 'legacy_format'.
calibration_data (Optional[Any]): Calibration data required for quantization.
calibration_steps (Optional[int]): Number of calibration steps, primarily for AWQ's
quantize() method. Defaults to 100.
device (Optional[str]): Device to run quantization on ('cpu', 'cuda', 'cuda:x').
If None, default device selection logic in BaseQuantizer is used.

Returns:
Tuple[PreTrainedModel, Any]: The quantized model and its associated tokenizer.

Raises:
ValueError: If an unsupported quantization method is specified or essential parameters are missing.
RuntimeError: If the quantizer fails to produce a quantized model.
"""
logger = TrainingLogger()
if quant_config_dict is None:
quant_config_dict = {}

method_lower = method.lower()
logger.log_info(f"Attempting to quantize model '{model_name_or_path}' using method: {method_lower}")

bits = quant_config_dict.get('bits', 4)
group_size = quant_config_dict.get('group_size', 128)
quantizer_batch_size = quant_config_dict.get('batch_size', 4)

quantizer = None

if method_lower == 'awq':
awq_zero_point = quant_config_dict.get('zero_point', True)
awq_version = quant_config_dict.get('awq_version', 'v2')

quantizer = AWQQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
zero_point=awq_zero_point,
version=awq_version,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with AWQ... Bits: {bits}, Group Size: {group_size}, Zero Point: {awq_zero_point}, Version: {awq_version}")
quantizer.quantize( # Call quantize, model is updated in place
calibration_data=calibration_data,
calibration_steps=calibration_steps
)

elif method_lower == 'gptq':
gptq_actorder = quant_config_dict.get('actorder', True)
gptq_percdamp = quant_config_dict.get('percdamp', 0.01)
gptq_sym = quant_config_dict.get('sym', True)

quantizer = GPTQQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
actorder=gptq_actorder,
percdamp=gptq_percdamp,
sym=gptq_sym,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with GPTQ... Bits: {bits}, Group Size: {group_size}, ActOrder: {gptq_actorder}, Sym: {gptq_sym}")
quantizer.quantize(calibration_data=calibration_data) # Model updated in place

elif method_lower == 'gguf':
gguf_use_packed = quant_config_dict.get('use_packed', True)
gguf_cpu_offload = quant_config_dict.get('cpu_offload', False)
gguf_desc_act = quant_config_dict.get('desc_act', False)
gguf_desc_ten = quant_config_dict.get('desc_ten', False)
gguf_legacy_format = quant_config_dict.get('legacy_format', False)

quantizer = GGUFQuantizer(
model_or_model_name_or_path=model_name_or_path,
bits=bits,
group_size=group_size,
use_packed=gguf_use_packed,
cpu_offload=gguf_cpu_offload,
desc_act=gguf_desc_act,
desc_ten=gguf_desc_ten,
legacy_format=gguf_legacy_format,
batch_size=quantizer_batch_size,
device=device
)
logger.log_info(f"Quantizing with GGUF... Bits: {bits}, Group Size: {group_size}, Packed: {gguf_use_packed}, CPU Offload: {gguf_cpu_offload}")
quantizer.quantize(calibration_data=calibration_data) # Model updated in place

else:
logger.log_error(f"Unsupported quantization method: {method}")
raise ValueError(f"Unsupported quantization method: {method}. Supported methods are 'awq', 'gptq', 'gguf'.")

if quantizer is None or quantizer.model is None:
logger.log_error(f"Failed to initialize quantizer or obtain quantized model for method: {method}")
raise RuntimeError(f"Quantization failed for method: {method}. Quantizer or model is None.")

logger.log_info(f"Successfully quantized model with method: {method_lower}")
return quantizer.model, quantizer.tokenizer
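
A minimal usage sketch of the new QuantizerFactory.quantize_from_pretrained API added above. The model ID, vocabulary size, and calibration tensor shape are placeholders for illustration, not values taken from this PR:

import torch
from quantllm.api import QuantizerFactory

# Placeholder calibration batch of token IDs; tokenized calibration text is preferable in practice.
calibration = torch.randint(0, 32000, (16, 512))

model, tokenizer = QuantizerFactory.quantize_from_pretrained(
    model_name_or_path="facebook/opt-125m",  # placeholder model ID
    method="gptq",
    quant_config_dict={"bits": 4, "group_size": 128, "actorder": True, "sym": True},
    calibration_data=calibration,
    device="cuda",
)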
22 changes: 18 additions & 4 deletions quantllm/quant/awq.py
@@ -6,14 +6,14 @@
import numpy as np
from typing import Optional, Dict, Any, List, Union, Tuple
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

class AWQQuantizer(BaseQuantizer):
"""AWQ quantization implementation with memory-efficient processing."""

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 128,
zero_point: bool = True,
@@ -27,7 +27,8 @@ def __init__(
Initializes the AWQQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 128.
zero_point (bool, optional): Whether to use zero-point quantization for activations. Defaults to True.
@@ -39,7 +40,9 @@
The device for quantization operations ('cpu', 'cuda', etc.).
Inherited from BaseQuantizer. Defaults to None (auto-detection).
"""
super().__init__(model=model, bits=bits, device=device)
# Pass all relevant kwargs to BaseQuantizer
# AWQQuantizer specific args are handled here.
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.zero_point = zero_point
self.scale_dtype = scale_dtype
@@ -101,7 +104,18 @@ def quantize(

self._clear_memory()

# Update model config with quantization parameters
awq_specific_params = {
"zero_point": self.zero_point,
"version": self.version,
"scale_dtype": self.scale_dtype, # Added from __init__
"enable_mnn_kernel": self.enable_mnn_kernel # Added from __init__
# batch_size is a processing parameter and is not usually stored in the model config
}
self._update_model_config_with_quant_params("awq", awq_specific_params)

return self.model

def _collect_activation_stats(
self,
data: torch.Tensor # Removed num_steps parameter
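
For reference, a sketch of how the renamed model_or_model_name_or_path parameter can be used with AWQQuantizer directly; either a Hugging Face model ID/path or an already-loaded PreTrainedModel is accepted. The model ID and calibration tensor below are placeholders, not part of the PR:

import torch
from quantllm.quant.awq import AWQQuantizer

quantizer = AWQQuantizer(
    model_or_model_name_or_path="facebook/opt-125m",  # placeholder; a PreTrainedModel instance also works
    bits=4,
    group_size=128,
    zero_point=True,
    version="v2",
    batch_size=4,
    device="cuda",
)
calibration = torch.randint(0, 32000, (8, 512))  # placeholder token IDs
quantized_model = quantizer.quantize(calibration_data=calibration, calibration_steps=100)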
22 changes: 17 additions & 5 deletions quantllm/quant/gguf.py
@@ -6,7 +6,7 @@
import torch.nn as nn
from typing import Optional, Dict, Any, List, Union, Tuple
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

try:
import ctransformers
@@ -19,7 +19,7 @@ class GGUFQuantizer(BaseQuantizer):

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 32,
desc_act: bool = False,
@@ -34,7 +34,8 @@
Initializes the GGUFQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 32.
desc_act (bool, optional): Whether to describe activations in GGUF metadata. Defaults to False.
@@ -52,7 +53,7 @@
if not CT_AVAILABLE:
raise ImportError("CTransformers is required for GGUF quantization. Install with: pip install ctransformers")

super().__init__(model=model, bits=bits, device=device)
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.desc_act = desc_act
self.desc_ten = desc_ten
@@ -94,9 +95,20 @@
setattr(self.model, name, quantized)

self._clear_memory()

# Update model config with quantization parameters
gguf_specific_params = {
"use_packed": self.use_packed,
"cpu_offload": self.cpu_offload,
"desc_act": self.desc_act,
"desc_ten": self.desc_ten,
"legacy_format": self.legacy_format
# group_size is handled by BaseQuantizer if present as self.group_size
}
self._update_model_config_with_quant_params("gguf", gguf_specific_params)

return self.model

def _collect_stats(self, data: torch.Tensor) -> Dict[str, Dict[str, torch.Tensor]]:
"""Collect statistics for quantization with memory-efficient batch processing."""
stats = {}
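
The _collect_stats docstring above mentions memory-efficient batch processing; a generic sketch of that idea (not the repository's implementation, which returns a nested stats dict) is a per-feature absolute-max reduced over calibration batches:

import torch

def collect_absmax_stats(data: torch.Tensor, batch_size: int = 4) -> torch.Tensor:
    """Per-feature absolute maximum over calibration data, accumulated batch by
    batch to keep peak memory low. Illustrative sketch only."""
    absmax = None
    for start in range(0, data.shape[0], batch_size):
        batch = data[start:start + batch_size].float()
        # Reduce over every dimension except the last (feature) dimension.
        batch_max = batch.abs().amax(dim=tuple(range(batch.dim() - 1)))
        absmax = batch_max if absmax is None else torch.maximum(absmax, batch_max)
    return absmax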
21 changes: 16 additions & 5 deletions quantllm/quant/gptq.py
@@ -6,14 +6,14 @@
import torch.nn as nn
from typing import Optional, Dict, Any, List, Union
from transformers import PreTrainedModel
from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

class GPTQQuantizer(BaseQuantizer):
"""GPTQ quantization implementation with memory-efficient processing."""

def __init__(
self,
model: PreTrainedModel,
model_or_model_name_or_path: Union[str, PreTrainedModel], # Changed parameter name
bits: int = 4,
group_size: int = 128,
actorder: bool = False,
@@ -28,7 +28,8 @@ def __init__(
Initializes the GPTQQuantizer.

Args:
model (PreTrainedModel): The model to be quantized.
model_or_model_name_or_path (Union[str, PreTrainedModel]):
The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
bits (int, optional): Number of bits for quantization. Defaults to 4.
group_size (int, optional): Size of the quantization group. Defaults to 128.
actorder (bool, optional): Whether to use activation order for columns. Defaults to False.
@@ -43,7 +44,7 @@
The device for quantization operations ('cpu', 'cuda', etc.).
Inherited from BaseQuantizer. Defaults to None (auto-detection).
"""
super().__init__(model=model, bits=bits, device=device)
super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
self.group_size = group_size
self.actorder = actorder
self.allow_mixed_bits = allow_mixed_bits
@@ -101,8 +102,18 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
self._clear_memory()
del self.H[name]

# Update model config with quantization parameters
gptq_specific_params = {
"actorder": self.actorder,
"sym": self.sym,
"percdamp": self.percdamp,
"allow_mixed_bits": self.allow_mixed_bits # Added from __init__
# use_triton is a runtime/environment flag and does not need to be stored in the model config
}
self._update_model_config_with_quant_params("gptq", gptq_specific_params)

return self.model

def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
"""Compute Hessian approximation for a layer with memory-efficient processing."""
n = layer.in_features
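
The _compute_hessian method above computes the layer-wise Hessian approximation that GPTQ minimizes against; a generic, batched sketch of that approximation (not the repository's code) looks like this. The batch_size default is a placeholder:

import torch
import torch.nn as nn

def hessian_approximation(layer: nn.Linear, data: torch.Tensor, batch_size: int = 4) -> torch.Tensor:
    """GPTQ-style Hessian approximation H ~= (2/N) * sum_i x_i x_i^T over
    calibration inputs x_i (last dimension must equal layer.in_features),
    accumulated batch by batch to limit memory. Illustrative sketch only."""
    n = layer.in_features
    H = torch.zeros(n, n, dtype=torch.float32)
    count = 0
    for start in range(0, data.shape[0], batch_size):
        x = data[start:start + batch_size].reshape(-1, n).float()
        H += 2.0 * x.t() @ x
        count += x.shape[0]
    return H / max(count, 1)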