Commit 5b327ca

Fix the PushModel and API.
1 parent cdfee06 commit 5b327ca

File tree

2 files changed: +146, -108 lines

quantllm/api/high_level.py

Lines changed: 120 additions & 108 deletions
@@ -15,6 +15,7 @@
 from pathlib import Path
 import tempfile
 import shutil
+import glob

 class SystemResourceMonitor:
     """Monitor system resources during quantization."""
@@ -28,11 +29,11 @@ def _get_gpu_info(self) -> Dict[str, Any]:
         """Get comprehensive GPU information."""
         gpu_info = {"available": False, "devices": []}

-        if torch.cuda.is_available():
+        if torch.cuda.is_available():
             gpu_info["available"] = True
             gpu_info["device_count"] = torch.cuda.device_count()

-            for i in range(torch.cuda.device_count()):
+            for i in range(torch.cuda.device_count()):
                 props = torch.cuda.get_device_properties(i)
                 total_mem = props.total_memory / (1024**3)
                 allocated_mem = torch.cuda.memory_allocated(i) / (1024**3)
@@ -154,14 +155,14 @@ def estimate_model_size(model_name: Union[str, PreTrainedModel]) -> Dict[str, fl

 def _estimate_from_config(config, model_name: str) -> Dict[str, float]:
     """Estimate model size from configuration."""
-    if hasattr(config, 'num_parameters'):
+    if hasattr(config, 'num_parameters'):
         params = config.num_parameters
-    elif hasattr(config, 'n_params'):
+    elif hasattr(config, 'n_params'):
         params = config.n_params
-    elif hasattr(config, 'hidden_size') and hasattr(config, 'num_hidden_layers'):
+    elif hasattr(config, 'hidden_size') and hasattr(config, 'num_hidden_layers'):
         # Enhanced estimation for various architectures
-        hidden_size = config.hidden_size
-        num_layers = config.num_hidden_layers
+        hidden_size = config.hidden_size
+        num_layers = config.num_hidden_layers
         vocab_size = getattr(config, 'vocab_size', 32000)

         # Architecture-specific calculations
@@ -549,7 +550,7 @@ def quantize_from_pretrained(
            raise ValueError(f"Unsupported bits: {bits}. Supported values: {SUPPORTED_GGUF_BITS}")
        if quant_type and quant_type not in SUPPORTED_GGUF_TYPES.get(bits, {}):
            raise ValueError(f"Unsupported quant_type: {quant_type} for {bits} bits")
-
+
        # Analyze model and resources
        if verbose:
            progress.update(10, "Analyzing model and system resources...")
@@ -578,7 +579,7 @@ def quantize_from_pretrained(

        if device is None:
            device = optimal_config["device"]
-        if device_map == "auto":
+        if device_map == "auto":
            device_map = optimal_config["device_map"]
        if max_memory is None:
            max_memory = optimal_config.get("max_memory")
@@ -591,7 +592,7 @@ def quantize_from_pretrained(
            logger.log_info(f" • Device map: {device_map}")
            logger.log_info(f" • CPU offload: {cpu_offload}")
            logger.log_info(f" • Optimization level: {optimal_config['optimization_level']}")
-
+
        # Configure BitsAndBytes for 4-bit quantization
        if load_in_4bit:
            compute_dtype = bnb_4bit_compute_dtype or torch.float16
@@ -662,17 +663,18 @@ def quantize_from_pretrained(
         finally:
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-
+
     @staticmethod
     def save_quantized_model(
         model: PreTrainedModel,
         output_path: str,
-        save_format: str = "gguf",
+        save_format: str = "safetensors",
         save_tokenizer: bool = True,
         quant_config: Optional[Dict[str, Any]] = None,
         safe_serialization: bool = True,
         verbose: bool = False,
-        progress_callback: Optional[Callable] = None
+        progress_callback: Optional[Callable] = None,
+        replace_original: bool = True
     ):
         """
         Save a quantized model in either GGUF or safetensors format.
@@ -686,6 +688,7 @@ def save_quantized_model(
             safe_serialization: Whether to use safe serialization
             verbose: Whether to show detailed progress
             progress_callback: Optional callback for progress updates
+            replace_original: Whether to replace original model files with quantized ones
         """
         try:
             # Initialize progress tracking
@@ -697,113 +700,121 @@ def save_quantized_model(
             if progress_callback:
                 progress_callback(0, "Starting model export...")

+            # Get original model path from cache if available
+            original_path = None
+            if hasattr(model, 'config') and hasattr(model.config, '_name_or_path'):
+                from transformers.utils import HUGGINGFACE_HUB_CACHE
+                model_id = model.config._name_or_path
+                if '/' in model_id:  # It's a hub model
+                    cache_dir = os.getenv('TRANSFORMERS_CACHE', HUGGINGFACE_HUB_CACHE)
+                    org, model_name = model_id.split('/')
+                    potential_paths = glob.glob(os.path.join(cache_dir, 'models--' + org + '--' + model_name, '*', 'snapshots', '*'))
+                    if potential_paths:
+                        original_path = potential_paths[0]
+                        if verbose:
+                            logger.log_info(f"Found original model in cache: {original_path}")
+
             # Setup output directory
-            output_dir = os.path.abspath(os.path.dirname(output_path))
-            os.makedirs(output_dir, exist_ok=True)
-
-            # Get base filename without extension
-            base_name = os.path.splitext(os.path.basename(output_path))[0]
+            if output_path == "auto" and original_path:
+                output_path = original_path
+            else:
+                output_path = os.path.abspath(output_path)
+
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)

             if verbose:
-                logger.log_info("\n" + "="*80)
-                logger.log_info(f" SAVING MODEL IN {save_format.upper()} FORMAT ".center(80, "="))
-                logger.log_info("="*80)
-
-                # Log model details
-                total_params = sum(p.numel() for p in model.parameters())
-                model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
-
-                logger.log_info("📊 Model Information:")
-                logger.log_info("-"*40)
-                logger.log_info(f"• Architecture: {model.config.model_type}")
-                logger.log_info(f"• Total Parameters: {total_params:,}")
-                logger.log_info(f"• Model Size: {model_size_gb:.2f} GB")
-                logger.log_info(f"• Export Format: {save_format.upper()}")
-                logger.log_info(f"• Output Directory: {output_dir}")
-                logger.log_info("")
-
-            if save_format.lower() == "gguf":
-                if verbose:
-                    progress.start_phase("GGUF Conversion")
-
-                # Get quantization configuration
-                if not quant_config and hasattr(model.config, 'quantization_config'):
-                    config_dict = model.config.quantization_config
-                    if isinstance(config_dict, BitsAndBytesConfig):
-                        bits = 4 if config_dict.load_in_4bit else (8 if config_dict.load_in_8bit else 16)
-                        quant_config = {
-                            'bits': bits,
-                            'group_size': 128,
-                            'quant_type': f"Q{bits}_K_M" if bits <= 8 else "F16"
-                        }
-
-                # Convert to GGUF
-                from ..quant.llama_cpp_utils import LlamaCppConverter
-                converter = LlamaCppConverter(verbose=verbose)
-
-                if progress_callback:
-                    progress_callback(30, "Converting to GGUF format...")
-
-                # Ensure .gguf extension
-                if not output_path.lower().endswith('.gguf'):
-                    output_path = f"{output_path}.gguf"
-
-                gguf_path = converter.convert_to_gguf(
-                    model=model,
-                    output_dir=output_dir,
-                    bits=quant_config.get('bits', 4) if quant_config else 4,
-                    group_size=quant_config.get('group_size', 128) if quant_config else 128,
-                    save_tokenizer=save_tokenizer,
-                    custom_name=os.path.basename(output_path),
-                    progress_callback=progress_callback
-                )
-
-                if verbose:
-                    file_size = os.path.getsize(gguf_path) / (1024**3)
-                    logger.log_info(f"\n✅ GGUF model saved ({file_size:.2f} GB): {gguf_path}")
+                logger.log_info(f"Saving quantized model to: {output_path}")

-            else:  # safetensors format
-                if verbose:
-                    progress.start_phase("Safetensors Export")
-                    logger.log_info("\n💾 Saving model in safetensors format...")
-
-                if progress_callback:
-                    progress_callback(30, "Saving in safetensors format...")
-
-                # Create a temporary directory for sharded saving
-                with tempfile.TemporaryDirectory(prefix="model_save_", dir=output_dir) as temp_dir:
-                    # Save model with sharding for large models
+            # Create temporary directory for saving
+            with tempfile.TemporaryDirectory(prefix="quant_save_") as temp_dir:
+                if save_format.lower() == "gguf":
+                    if verbose:
+                        progress.start_phase("GGUF Conversion")
+
+                    # Convert to GGUF
+                    from ..quant.llama_cpp_utils import LlamaCppConverter
+                    converter = LlamaCppConverter(verbose=verbose)
+
+                    if progress_callback:
+                        progress_callback(30, "Converting to GGUF format...")
+
+                    # Ensure .gguf extension
+                    if not output_path.lower().endswith('.gguf'):
+                        output_path = f"{output_path}.gguf"
+
+                    gguf_path = converter.convert_to_gguf(
+                        model=model,
+                        output_dir=os.path.dirname(output_path),
+                        bits=quant_config.get('bits', 4) if quant_config else 4,
+                        group_size=quant_config.get('group_size', 128) if quant_config else 128,
+                        save_tokenizer=save_tokenizer,
+                        custom_name=os.path.basename(output_path)
+                    )
+
+                    if verbose:
+                        file_size = os.path.getsize(gguf_path) / (1024**3)
+                        logger.log_info(f"\n✅ GGUF model saved ({file_size:.2f} GB): {gguf_path}")
+
+                else:  # safetensors format
+                    if verbose:
+                        progress.start_phase("Safetensors Export")
+
+                    if progress_callback:
+                        progress_callback(30, "Saving in safetensors format...")
+
+                    # Save to temporary directory first
                     model.save_pretrained(
                         temp_dir,
                         safe_serialization=safe_serialization,
                         max_shard_size="2GB"
                     )

-                    if progress_callback:
-                        progress_callback(60, "Moving files to final location...")
+                    if save_tokenizer and hasattr(model, 'tokenizer'):
+                        model.tokenizer.save_pretrained(temp_dir)

-                    # Move files to final location
-                    for file in os.listdir(temp_dir):
-                        src = os.path.join(temp_dir, file)
-                        dst = os.path.join(output_dir, file)
-                        if os.path.exists(dst):
-                            os.remove(dst)
-                        shutil.move(src, dst)
+                # If replacing original files in cache
+                if replace_original and original_path:
+                    target_dir = original_path
+                    if verbose:
+                        logger.log_info(f"Replacing original files in: {target_dir}")
+
+                    # Remove old model files but keep config and tokenizer
+                    for file in os.listdir(target_dir):
+                        if file.endswith(('.bin', '.safetensors', '.pt', '.gguf')):
+                            os.remove(os.path.join(target_dir, file))
+
+                    # Copy new files
+                    for file in os.listdir(temp_dir):
+                        src = os.path.join(temp_dir, file)
+                        dst = os.path.join(target_dir, file)
+                        if os.path.exists(dst):
+                            os.remove(dst)
+                        shutil.copy2(src, dst)
+
+                    if verbose:
+                        logger.log_info("✅ Original model files replaced with quantized versions")

-                    # Save tokenizer if requested
-                    if save_tokenizer and hasattr(model, 'tokenizer'):
-                        if progress_callback:
-                            progress_callback(80, "Saving tokenizer...")
-                        model.tokenizer.save_pretrained(output_dir)
-
-                    if verbose:
-                        total_size = sum(
-                            os.path.getsize(os.path.join(output_dir, f)) / (1024**3)
-                            for f in os.listdir(output_dir)
-                            if f.endswith('.safetensors')
-                        )
-                        logger.log_info(f"\n✅ Model saved in safetensors format ({total_size:.2f} GB)")
-                        logger.log_info(f"📁 Output directory: {output_dir}")
+                # If saving to custom location
+                else:
+                    target_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+                    os.makedirs(target_dir, exist_ok=True)
+
+                    # Copy files to final location
+                    for file in os.listdir(temp_dir):
+                        src = os.path.join(temp_dir, file)
+                        dst = os.path.join(target_dir, file)
+                        if os.path.exists(dst):
+                            os.remove(dst)
+                        shutil.copy2(src, dst)
+
+                    if verbose:
+                        total_size = sum(
+                            os.path.getsize(os.path.join(target_dir, f)) / (1024**3)
+                            for f in os.listdir(target_dir)
+                            if f.endswith('.safetensors')
+                        )
+                        logger.log_info(f"\n✅ Model saved in safetensors format ({total_size:.2f} GB)")
+                        logger.log_info(f"📁 Output directory: {target_dir}")

             if verbose:
                 progress.end_phase()
@@ -818,3 +829,4 @@ def save_quantized_model(
         finally:
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
+
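
Usage sketch (illustration only): how the updated save_quantized_model API might be called after this change. The enclosing class name QuantLLM, the import path, and the stand-in model id are assumptions not shown in this diff; the parameter names and the new defaults (save_format="safetensors", replace_original=True) come from the hunks above.

# Sketch only: the class name QuantLLM and the stand-in model are assumed for illustration.
from transformers import AutoModelForCausalLM
from quantllm.api.high_level import QuantLLM  # assumed export path

# In practice `model` would be the quantized model produced by this library
# (e.g. via quantize_from_pretrained); a small hub model stands in here.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

QuantLLM.save_quantized_model(
    model=model,
    output_path="auto",          # with replace_original=True, resolves to the cached hub snapshot
    save_format="safetensors",   # new default introduced by this commit
    save_tokenizer=True,
    replace_original=True,       # overwrite the original weight files in the cache
    verbose=True,
)

With output_path="auto" and replace_original=True, the method looks up the model's snapshot under the Hugging Face cache (models--<org>--<name>/*/snapshots/*), removes the old weight files, and copies the freshly saved shards in their place.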

quantllm/hub/hub_manager.py

Lines changed: 26 additions & 0 deletions
@@ -14,6 +14,7 @@ def __init__(
         self.model_id = model_id
         self.organization = organization
         self.token = token
+

     def login(self):
         """Login to Hugging Face Hub."""
@@ -62,6 +63,31 @@ def push_model(
         except Exception as e:
             logger.log_error(f"Error pushing to hub: {str(e)}")
             raise
+
+    def push_folder(
+        self,
+        folder_path: str,
+        commit_message: str = "Update model",
+        allow_patterns: Optional[list] = None,
+        ignore_patterns: Optional[list] = None,
+        **kwargs
+    ):
+        """Push all files from a folder to the HuggingFace Hub."""
+        try:
+
+            self.api.upload_folder(
+                folder_path=folder_path,
+                repo_id=self.model_id,
+                token=self.token,
+                commit_message=commit_message,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                **kwargs
+            )
+            logger.log_success(f"Successfully pushed folder to {self.model_id}")
+        except Exception as e:
+            logger.log_error(f"Error pushing folder: {str(e)}")
+            raise

     def push_checkpoint(
         self,
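
Usage sketch (illustration only): how the new push_folder method might be used. The manager class name HubManager and its import path are assumptions; the method itself wraps huggingface_hub's upload_folder with the arguments shown in the diff above.

# Sketch only: HubManager and its constructor are assumed for illustration.
from quantllm.hub.hub_manager import HubManager

manager = HubManager(
    model_id="your-org/your-quantized-model",  # placeholder repo id
    organization=None,
    token="hf_xxx",                            # your Hugging Face access token
)
manager.login()

# Push an entire local folder (e.g. the output of save_quantized_model) to the Hub,
# uploading only weights, configs, and tokenizer files.
manager.push_folder(
    folder_path="./quantized-model",
    commit_message="Upload quantized safetensors weights",
    allow_patterns=["*.safetensors", "*.json", "tokenizer.*"],
    ignore_patterns=["*.tmp", "*.lock"],
)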
