@@ -15,6 +15,7 @@
 from pathlib import Path
 import tempfile
 import shutil
+import glob
 
 class SystemResourceMonitor:
     """Monitor system resources during quantization."""
@@ -28,11 +29,11 @@ def _get_gpu_info(self) -> Dict[str, Any]:
         """Get comprehensive GPU information."""
         gpu_info = {"available": False, "devices": []}
 
-        if torch.cuda.is_available():
+        if torch.cuda.is_available():
             gpu_info["available"] = True
             gpu_info["device_count"] = torch.cuda.device_count()
 
-            for i in range(torch.cuda.device_count()):
+            for i in range(torch.cuda.device_count()):
                 props = torch.cuda.get_device_properties(i)
                 total_mem = props.total_memory / (1024 ** 3)
                 allocated_mem = torch.cuda.memory_allocated(i) / (1024 ** 3)
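For reference, the CUDA queries this hunk relies on can be exercised standalone; a minimal sketch (memory_reserved is an extra field, not used above):

    import torch

    # Per-device memory survey, mirroring the fields gathered in the hunk above.
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            total_gb = props.total_memory / (1024 ** 3)                  # total VRAM
            allocated_gb = torch.cuda.memory_allocated(i) / (1024 ** 3)  # live tensors
            reserved_gb = torch.cuda.memory_reserved(i) / (1024 ** 3)    # allocator cache
            print(f"cuda:{i} {props.name}: {allocated_gb:.2f}/{total_gb:.2f} GiB ({reserved_gb:.2f} reserved)")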
@@ -154,14 +155,14 @@ def estimate_model_size(model_name: Union[str, PreTrainedModel]) -> Dict[str, float]:
 
 def _estimate_from_config(config, model_name: str) -> Dict[str, float]:
     """Estimate model size from configuration."""
-    if hasattr(config, 'num_parameters'):
+    if hasattr(config, 'num_parameters'):
         params = config.num_parameters
-    elif hasattr(config, 'n_params'):
+    elif hasattr(config, 'n_params'):
         params = config.n_params
-    elif hasattr(config, 'hidden_size') and hasattr(config, 'num_hidden_layers'):
+    elif hasattr(config, 'hidden_size') and hasattr(config, 'num_hidden_layers'):
         # Enhanced estimation for various architectures
-        hidden_size = config.hidden_size
-        num_layers = config.num_hidden_layers
+        hidden_size = config.hidden_size
+        num_layers = config.num_hidden_layers
         vocab_size = getattr(config, 'vocab_size', 32000)
 
         # Architecture-specific calculations
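The hidden_size/num_layers fallback above is the usual back-of-the-envelope for decoder-only transformers; a sketch with illustrative constants (not necessarily the per-architecture factors this code applies):

    def rough_param_count(hidden_size: int, num_layers: int, vocab_size: int = 32000) -> int:
        embeddings = vocab_size * hidden_size  # token embeddings, often tied to the LM head
        attention = 4 * hidden_size ** 2       # Q, K, V, output projections per layer
        mlp = 8 * hidden_size ** 2             # up/down projections at ~4x expansion
        return embeddings + num_layers * (attention + mlp)

    print(rough_param_count(4096, 32))  # ≈ 6.6e9 for a 7B-class config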
@@ -549,7 +550,7 @@ def quantize_from_pretrained(
            raise ValueError(f"Unsupported bits: {bits}. Supported values: {SUPPORTED_GGUF_BITS}")
        if quant_type and quant_type not in SUPPORTED_GGUF_TYPES.get(bits, {}):
            raise ValueError(f"Unsupported quant_type: {quant_type} for {bits} bits")
-
+
        # Analyze model and resources
        if verbose:
            progress.update(10, "Analyzing model and system resources...")
@@ -578,7 +579,7 @@ def quantize_from_pretrained(
 
        if device is None:
            device = optimal_config["device"]
-       if device_map == "auto":
+       if device_map == "auto":
            device_map = optimal_config["device_map"]
        if max_memory is None:
            max_memory = optimal_config.get("max_memory")
@@ -591,7 +592,7 @@ def quantize_from_pretrained(
            logger.log_info(f" • Device map: {device_map}")
            logger.log_info(f" • CPU offload: {cpu_offload}")
            logger.log_info(f" • Optimization level: {optimal_config['optimization_level']}")
-
+
        # Configure BitsAndBytes for 4-bit quantization
        if load_in_4bit:
            compute_dtype = bnb_4bit_compute_dtype or torch.float16
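For orientation, a typical 4-bit BitsAndBytes setup in transformers looks like the sketch below; the exact flags this code passes are an assumption here, not a quote:

    import torch
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,  # matmuls in fp16 while weights stay 4-bit
        bnb_4bit_quant_type="nf4",             # NormalFloat4; "fp4" is the alternative
        bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    )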
@@ -662,17 +663,18 @@ def quantize_from_pretrained(
        finally:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
-
+
    @staticmethod
    def save_quantized_model(
        model: PreTrainedModel,
        output_path: str,
-       save_format: str = "gguf",
+       save_format: str = "safetensors",
        save_tokenizer: bool = True,
        quant_config: Optional[Dict[str, Any]] = None,
        safe_serialization: bool = True,
        verbose: bool = False,
-       progress_callback: Optional[Callable] = None
+       progress_callback: Optional[Callable] = None,
+       replace_original: bool = True
    ):
        """
        Save a quantized model in either GGUF or safetensors format.
@@ -686,6 +688,7 @@ def save_quantized_model(
            safe_serialization: Whether to use safe serialization
            verbose: Whether to show detailed progress
            progress_callback: Optional callback for progress updates
+           replace_original: Whether to replace original model files with quantized ones
        """
        try:
            # Initialize progress tracking
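The docstring leaves progress_callback untyped; the call sites below invoke it as callback(percent, message), so any compatible callable works, e.g.:

    def on_progress(percent: int, message: str) -> None:
        # Matches calls like progress_callback(30, "Converting to GGUF format...")
        print(f"[{percent:3d}%] {message}")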
@@ -697,113 +700,121 @@ def save_quantized_model(
            if progress_callback:
                progress_callback(0, "Starting model export...")
 
+           # Get original model path from cache if available
+           original_path = None
+           if hasattr(model, 'config') and hasattr(model.config, '_name_or_path'):
+               from transformers.utils import HUGGINGFACE_HUB_CACHE
+               model_id = model.config._name_or_path
+               if '/' in model_id and not os.path.isdir(model_id):  # a hub id, not a local path
+                   cache_dir = os.getenv('TRANSFORMERS_CACHE', HUGGINGFACE_HUB_CACHE)
+                   # Split once: hub ids are "org/name"
+                   org, model_name = model_id.split('/', 1)
+                   # Hub cache layout: models--{org}--{name}/snapshots/{revision}
+                   potential_paths = glob.glob(os.path.join(cache_dir, f'models--{org}--{model_name}', 'snapshots', '*'))
+                   if potential_paths:
+                       original_path = potential_paths[0]
+                       if verbose:
+                           logger.log_info(f"Found original model in cache: {original_path}")
+
            # Setup output directory
-           output_dir = os.path.abspath(os.path.dirname(output_path))
-           os.makedirs(output_dir, exist_ok=True)
-
-           # Get base filename without extension
-           base_name = os.path.splitext(os.path.basename(output_path))[0]
+           if output_path == "auto" and original_path:
+               output_path = original_path
+           else:
+               output_path = os.path.abspath(output_path)
+
+           os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
            if verbose:
-               logger.log_info("\n" + "=" * 80)
-               logger.log_info(f" SAVING MODEL IN {save_format.upper()} FORMAT ".center(80, "="))
-               logger.log_info("=" * 80)
-
-               # Log model details
-               total_params = sum(p.numel() for p in model.parameters())
-               model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 3)
-
-               logger.log_info("📊 Model Information:")
-               logger.log_info("-" * 40)
-               logger.log_info(f"• Architecture: {model.config.model_type}")
-               logger.log_info(f"• Total Parameters: {total_params:,}")
-               logger.log_info(f"• Model Size: {model_size_gb:.2f} GB")
-               logger.log_info(f"• Export Format: {save_format.upper()}")
-               logger.log_info(f"• Output Directory: {output_dir}")
-               logger.log_info("")
-
-           if save_format.lower() == "gguf":
-               if verbose:
-                   progress.start_phase("GGUF Conversion")
-
-               # Get quantization configuration
-               if not quant_config and hasattr(model.config, 'quantization_config'):
-                   config_dict = model.config.quantization_config
-                   if isinstance(config_dict, BitsAndBytesConfig):
-                       bits = 4 if config_dict.load_in_4bit else (8 if config_dict.load_in_8bit else 16)
-                       quant_config = {
-                           'bits': bits,
-                           'group_size': 128,
-                           'quant_type': f"Q{bits}_K_M" if bits <= 8 else "F16"
-                       }
-
-               # Convert to GGUF
-               from ..quant.llama_cpp_utils import LlamaCppConverter
-               converter = LlamaCppConverter(verbose=verbose)
-
-               if progress_callback:
-                   progress_callback(30, "Converting to GGUF format...")
-
-               # Ensure .gguf extension
-               if not output_path.lower().endswith('.gguf'):
-                   output_path = f"{output_path}.gguf"
-
-               gguf_path = converter.convert_to_gguf(
-                   model=model,
-                   output_dir=output_dir,
-                   bits=quant_config.get('bits', 4) if quant_config else 4,
-                   group_size=quant_config.get('group_size', 128) if quant_config else 128,
-                   save_tokenizer=save_tokenizer,
-                   custom_name=os.path.basename(output_path),
-                   progress_callback=progress_callback
-               )
-
-               if verbose:
-                   file_size = os.path.getsize(gguf_path) / (1024 ** 3)
-                   logger.log_info(f"\n✅ GGUF model saved ({file_size:.2f} GB): {gguf_path}")
+               logger.log_info(f"Saving quantized model to: {output_path}")
 
-           else:  # safetensors format
-               if verbose:
-                   progress.start_phase("Safetensors Export")
-                   logger.log_info("\n💾 Saving model in safetensors format...")
-
-               if progress_callback:
-                   progress_callback(30, "Saving in safetensors format...")
-
-               # Create a temporary directory for sharded saving
-               with tempfile.TemporaryDirectory(prefix="model_save_", dir=output_dir) as temp_dir:
-                   # Save model with sharding for large models
+           # Create temporary directory for saving
+           with tempfile.TemporaryDirectory(prefix="quant_save_") as temp_dir:
+               if save_format.lower() == "gguf":
+                   if verbose:
+                       progress.start_phase("GGUF Conversion")
+
+                   # Convert to GGUF
+                   from ..quant.llama_cpp_utils import LlamaCppConverter
+                   converter = LlamaCppConverter(verbose=verbose)
+
+                   if progress_callback:
+                       progress_callback(30, "Converting to GGUF format...")
+
+                   # Ensure .gguf extension
+                   if not output_path.lower().endswith('.gguf'):
+                       output_path = f"{output_path}.gguf"
+
+                   gguf_path = converter.convert_to_gguf(
+                       model=model,
+                       output_dir=os.path.dirname(output_path),
+                       bits=quant_config.get('bits', 4) if quant_config else 4,
+                       group_size=quant_config.get('group_size', 128) if quant_config else 128,
+                       save_tokenizer=save_tokenizer,
+                       custom_name=os.path.basename(output_path)
+                   )
+
+                   if verbose:
+                       file_size = os.path.getsize(gguf_path) / (1024 ** 3)
+                       logger.log_info(f"\n✅ GGUF model saved ({file_size:.2f} GB): {gguf_path}")
+
+               else:  # safetensors format
+                   if verbose:
+                       progress.start_phase("Safetensors Export")
+
+                   if progress_callback:
+                       progress_callback(30, "Saving in safetensors format...")
+
+                   # Save to temporary directory first
                    model.save_pretrained(
                        temp_dir,
                        safe_serialization=safe_serialization,
                        max_shard_size="2GB"
                    )
 
-                   if progress_callback:
-                       progress_callback(60, "Moving files to final location...")
+                   if save_tokenizer and hasattr(model, 'tokenizer'):
+                       model.tokenizer.save_pretrained(temp_dir)
 
-                   # Move files to final location
-                   for file in os.listdir(temp_dir):
-                       src = os.path.join(temp_dir, file)
-                       dst = os.path.join(output_dir, file)
-                       if os.path.exists(dst):
-                           os.remove(dst)
-                       shutil.move(src, dst)
+                   # If replacing original files in cache
+                   if replace_original and original_path:
+                       target_dir = original_path
+                       if verbose:
+                           logger.log_info(f"Replacing original files in: {target_dir}")
+
+                       # Remove old model files but keep config and tokenizer
+                       for file in os.listdir(target_dir):
+                           if file.endswith(('.bin', '.safetensors', '.pt', '.gguf')):
+                               os.remove(os.path.join(target_dir, file))
+
+                       # Copy new files
+                       for file in os.listdir(temp_dir):
+                           src = os.path.join(temp_dir, file)
+                           dst = os.path.join(target_dir, file)
+                           if os.path.exists(dst):
+                               os.remove(dst)
+                           shutil.copy2(src, dst)
+
+                       if verbose:
+                           logger.log_info("✅ Original model files replaced with quantized versions")
 
-                   # Save tokenizer if requested
-                   if save_tokenizer and hasattr(model, 'tokenizer'):
-                       if progress_callback:
-                           progress_callback(80, "Saving tokenizer...")
-                       model.tokenizer.save_pretrained(output_dir)
-
-               if verbose:
-                   total_size = sum(
-                       os.path.getsize(os.path.join(output_dir, f)) / (1024 ** 3)
-                       for f in os.listdir(output_dir)
-                       if f.endswith('.safetensors')
-                   )
-                   logger.log_info(f"\n✅ Model saved in safetensors format ({total_size:.2f} GB)")
-                   logger.log_info(f"📁 Output directory: {output_dir}")
+                   # If saving to custom location
+                   else:
+                       target_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+                       os.makedirs(target_dir, exist_ok=True)
+
+                       # Copy files to final location
+                       for file in os.listdir(temp_dir):
+                           src = os.path.join(temp_dir, file)
+                           dst = os.path.join(target_dir, file)
+                           if os.path.exists(dst):
+                               os.remove(dst)
+                           shutil.copy2(src, dst)
+
+                       if verbose:
+                           total_size = sum(
+                               os.path.getsize(os.path.join(target_dir, f)) / (1024 ** 3)
+                               for f in os.listdir(target_dir)
+                               if f.endswith('.safetensors')
+                           )
+                           logger.log_info(f"\n✅ Model saved in safetensors format ({total_size:.2f} GB)")
+                           logger.log_info(f"📁 Output directory: {target_dir}")
 
            if verbose:
                progress.end_phase()
@@ -818,3 +829,4 @@ def save_quantized_model(
        finally:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
+
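Taken together, a hedged usage sketch of the new save path; "Quantizer" is a placeholder for whichever class in this repo exposes these staticmethods:

    # Hypothetical entry point and hub id, for illustration only.
    model = Quantizer.quantize_from_pretrained("org/model", bits=4, load_in_4bit=True)

    # New defaults: safetensors format, and replace_original=True overwrites the
    # cached hub snapshot when output_path="auto" resolves to it.
    Quantizer.save_quantized_model(model, output_path="auto", verbose=True)

    # Keep the cache untouched and write somewhere else instead:
    Quantizer.save_quantized_model(model, "./quantized", replace_original=False)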