This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit b676256

[NeuralChat] Remove unnecessary model load during optimizing model (#722)
* Remove unnecessary model load during optimizing model

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
1 parent 40472f0 commit b676256

File tree

2 files changed: +29 -24 lines


intel_extension_for_transformers/llm/quantization/optimization.py

Lines changed: 20 additions & 15 deletions
@@ -25,7 +25,11 @@ def __init__(
         self.optimization_config = optimization_config

     def optimize(self, model, use_llm_runtime=False):
-        optimized_model = model
+        if isinstance(model, str):
+            model_name = model
+        else:
+            model_name = model.config._name_or_path
+        optimized_model = model
         from intel_extension_for_transformers.transformers import (
             MixedPrecisionConfig,
             WeightOnlyQuantConfig,
@@ -35,39 +39,40 @@ def optimize(self, model, use_llm_runtime=False):
             f"Expect optimization_config be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" + \
             " or BitsAndBytesConfig,got {type(self.optimization_config)}."
         config = self.optimization_config
-        if re.search("flan-t5", model.config._name_or_path, re.IGNORECASE):
+        if re.search("flan-t5", model_name, re.IGNORECASE):
             from intel_extension_for_transformers.transformers import AutoModelForSeq2SeqLM
             optimized_model = AutoModelForSeq2SeqLM.from_pretrained(
-                model.config._name_or_path,
+                model_name,
                 quantization_config=config,
                 use_llm_runtime=use_llm_runtime,
                 trust_remote_code=True)
         elif (
-            re.search("gpt", model.config._name_or_path, re.IGNORECASE)
-            or re.search("mpt", model.config._name_or_path, re.IGNORECASE)
-            or re.search("bloom", model.config._name_or_path, re.IGNORECASE)
-            or re.search("llama", model.config._name_or_path, re.IGNORECASE)
-            or re.search("opt", model.config._name_or_path, re.IGNORECASE)
-            or re.search("neural-chat-7b-v1", model.config._name_or_path, re.IGNORECASE)
-            or re.search("neural-chat-7b-v2", model.config._name_or_path, re.IGNORECASE)
+            re.search("gpt", model_name, re.IGNORECASE)
+            or re.search("mpt", model_name, re.IGNORECASE)
+            or re.search("bloom", model_name, re.IGNORECASE)
+            or re.search("llama", model_name, re.IGNORECASE)
+            or re.search("opt", model_name, re.IGNORECASE)
+            or re.search("neural-chat-7b-v1", model_name, re.IGNORECASE)
+            or re.search("neural-chat-7b-v2", model_name, re.IGNORECASE)
+            or re.search("neural-chat-7b-v3", model_name, re.IGNORECASE)
         ):
             from intel_extension_for_transformers.transformers import AutoModelForCausalLM
             optimized_model = AutoModelForCausalLM.from_pretrained(
-                model.config._name_or_path,
+                model_name,
                 quantization_config=config,
                 use_llm_runtime=use_llm_runtime,
                 trust_remote_code=True)
-        elif re.search("starcoder", model.config._name_or_path, re.IGNORECASE):
+        elif re.search("starcoder", model_name, re.IGNORECASE):
             from intel_extension_for_transformers.transformers import GPTBigCodeForCausalLM
             optimized_model = GPTBigCodeForCausalLM.from_pretrained(
-                model.config._name_or_path,
+                model_name,
                 quantization_config=config,
                 use_llm_runtime=use_llm_runtime,
                 trust_remote_code=True)
-        elif re.search("chatglm", model.config._name_or_path, re.IGNORECASE):
+        elif re.search("chatglm", model_name, re.IGNORECASE):
             from intel_extension_for_transformers.transformers import AutoModel
             optimized_model = AutoModel.from_pretrained(
-                model.config._name_or_path,
+                model_name,
                 quantization_config=config,
                 use_llm_runtime=use_llm_runtime,
                 trust_remote_code=True)
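
Net effect of this file's change: optimize() resolves a model_name once, accepting either a checkpoint-name string or an already-loaded model, and every loader branch then quantizes from that name instead of re-reading model.config._name_or_path. A minimal, self-contained sketch of that resolve-then-dispatch pattern (pure Python; resolve_model_name and pick_loader are illustrative helpers for this page, not the project's API):

import re

def resolve_model_name(model):
    # Accept either a checkpoint name string or a loaded model that
    # exposes config._name_or_path, as transformers models do.
    if isinstance(model, str):
        return model
    return model.config._name_or_path

def pick_loader(model_name):
    # Mirror the dispatch order in the diff above; the fallback branch
    # stands in for the real code keeping the input model unchanged.
    if re.search("flan-t5", model_name, re.IGNORECASE):
        return "AutoModelForSeq2SeqLM"
    if re.search(r"gpt|mpt|bloom|llama|opt|neural-chat-7b-v[123]",
                 model_name, re.IGNORECASE):
        return "AutoModelForCausalLM"
    if re.search("starcoder", model_name, re.IGNORECASE):
        return "GPTBigCodeForCausalLM"
    if re.search("chatglm", model_name, re.IGNORECASE):
        return "AutoModel"
    return "unchanged"

print(pick_loader(resolve_model_name("Intel/neural-chat-7b-v3")))
# -> AutoModelForCausalLM

Because each branch only needs the name, a caller holding just a checkpoint string can reach the quantized from_pretrained call without ever loading full-precision weights first.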

intel_extension_for_transformers/neural_chat/models/model_utils.py

Lines changed: 9 additions & 9 deletions
@@ -332,6 +332,15 @@ def load_model(
     config = AutoConfig.from_pretrained(model_name, use_auth_token=hf_access_token, trust_remote_code=True \
         if re.search("chatglm", model_name, re.IGNORECASE) else False)
     load_to_meta = model_on_meta(config)
+
+    if isinstance(optimization_config, WeightOnlyQuantConfig):
+        from intel_extension_for_transformers.neural_chat.chatbot import optimize_model
+        model = optimize_model(model_name, optimization_config, use_llm_runtime)
+        MODELS[model_name]["model"] = model
+        MODELS[model_name]["tokenizer"] = tokenizer
+        print("Optimized Model loaded.")
+        return
+
     if peft_path and device == "hpu" and use_deepspeed and load_to_meta:
         print("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
         load_to_meta = False
@@ -426,15 +435,6 @@ def load_model(
     if model.generation_config.eos_token_id is None:
         model.generation_config.eos_token_id = tokenizer.eos_token_id

-    if isinstance(optimization_config, WeightOnlyQuantConfig):
-        from intel_extension_for_transformers.neural_chat.chatbot import optimize_model
-        model = optimize_model(model, optimization_config, use_llm_runtime)
-
-        MODELS[model_name]["model"] = model
-        MODELS[model_name]["tokenizer"] = tokenizer
-        print("Optimized Model loaded.")
-        return
-
     if device == "hpu":
         if peft_path:
             from peft import PeftModel
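
In model_utils.py the weight-only-quantization branch simply moves ahead of the full-precision load and now receives model_name rather than a loaded model, so load_model returns before any FP32/FP16 weights are materialized. A toy sketch of that reordering (stand-in strings replace the real tokenizer, model, and optimize_model call; none of this is the project's actual API):

MODELS = {}

def load_model_sketch(model_name, weight_only_quant_config=None):
    tokenizer = f"tokenizer({model_name})"      # stands in for AutoTokenizer.from_pretrained
    MODELS.setdefault(model_name, {})
    if weight_only_quant_config is not None:    # stands in for isinstance(..., WeightOnlyQuantConfig)
        # New ordering: quantize straight from the checkpoint name ...
        model = f"woq({model_name})"            # stands in for optimize_model(model_name, config, ...)
        MODELS[model_name]["model"] = model
        MODELS[model_name]["tokenizer"] = tokenizer
        return                                  # ... and skip the full-precision load entirely
    # The load below is what the old ordering always paid for, even when
    # the model was about to be replaced by its quantized counterpart.
    MODELS[model_name]["model"] = f"fp32({model_name})"
    MODELS[model_name]["tokenizer"] = tokenizer

load_model_sketch("Intel/neural-chat-7b-v3", weight_only_quant_config={"bits": 4})
print(MODELS["Intel/neural-chat-7b-v3"]["model"])
# -> woq(Intel/neural-chat-7b-v3)

The early return is the memory win: in the old ordering the full-precision model was loaded near the end of load_model and then discarded once the quantized replacement was built.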
