
Commit 29bbd80

[NeuralChat] Support LLM runtime ggml int4 (#1098)
* Support llm runtime ggml int4

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
1 parent e6ecb21 commit 29bbd80

2 files changed: +3 −2 lines

intel_extension_for_transformers/neural_chat/models/model_utils.py

Lines changed: 1 addition & 1 deletion
@@ -911,7 +911,7 @@ def generate_output():
             max_new_tokens=max_new_tokens,
             ctx_size=max_new_tokens,
             ignore_prompt=True,
-            interactive=True,
+            interactive=False if "magicoder" in model_name.lower() else True,
             do_sample=do_sample,
             num_beams=num_beams,
             n_keep=2 if "chatglm" in model_name.lower() else 1
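For illustration, here is a minimal Python sketch of the kwarg selection this change introduces: interactive mode is turned off for Magicoder models, while ChatGLM models keep the existing n_keep=2 special case. The helper function and the model ids below are hypothetical and not part of the commit.

# Hypothetical helper (not in the repository) illustrating the kwarg logic
# added by this commit: Magicoder models disable interactive mode, and
# ChatGLM models keep n_keep=2 while everything else uses n_keep=1.
def build_generate_kwargs(model_name, max_new_tokens, do_sample, num_beams):
    name = model_name.lower()
    return dict(
        max_new_tokens=max_new_tokens,
        ctx_size=max_new_tokens,                # context window sized to the request
        ignore_prompt=True,
        interactive=False if "magicoder" in name else True,
        do_sample=do_sample,
        num_beams=num_beams,
        n_keep=2 if "chatglm" in name else 1,   # existing ChatGLM special case
    )

# Placeholder model ids, used only to exercise the two branches.
print(build_generate_kwargs("ise-uiuc/Magicoder-S-DS-6.7B", 256, True, 1)["interactive"])  # False
print(build_generate_kwargs("THUDM/chatglm2-6b", 256, True, 1)["n_keep"])                  # 2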

intel_extension_for_transformers/neural_chat/server/neuralchat_server.py

Lines changed: 2 additions & 1 deletion
@@ -161,6 +161,7 @@ def init(self, config):
         compute_dtype = yaml_config.get("compute_dtype", {})
         weight_dtype = yaml_config.get("weight_dtype", {})
         use_cached_bin = yaml_config.get("use_cached_bin", {})
+        use_ggml = yaml_config.get("use_ggml", False)
         mix_precision_dtype = yaml_config.get("mix_precision_dtype", {})
         load_in_4bit = yaml_config.get("load_in_4bit", {})
         bnb_4bit_quant_type = yaml_config.get("bnb_4bit_quant_type", {})
@@ -172,7 +173,7 @@ def init(self, config):
             from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig
             if optimization_type == "weight_only":
                 optimization_config = WeightOnlyQuantConfig(compute_dtype=compute_dtype, weight_dtype=weight_dtype,
-                                                            use_cache=use_cached_bin)
+                                                            use_ggml=use_ggml, use_cache=use_cached_bin)
             elif optimization_type == "mix_precision":
                 optimization_config = MixedPrecisionConfig(dtype=mix_precision_dtype)
             elif optimization_type == "bits_and_bytes":
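As a hedged sketch of how the new use_ggml key flows through the server, the snippet below mirrors the YAML lookups in init() and passes the flag to WeightOnlyQuantConfig. The AutoModelForCausalLM.from_pretrained call with quantization_config follows the project's published weight-only quantization examples rather than this commit, and the model id and dtype values are placeholders.

from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

# Values a server YAML might carry; the keys mirror what init() reads above,
# but the concrete values here are placeholders.
yaml_config = {
    "compute_dtype": "int8",
    "weight_dtype": "int4",
    "use_cached_bin": False,
    "use_ggml": True,   # new key introduced by this commit
}

# Same construction as the weight_only branch in neuralchat_server.py.
optimization_config = WeightOnlyQuantConfig(
    compute_dtype=yaml_config.get("compute_dtype", "int8"),
    weight_dtype=yaml_config.get("weight_dtype", "int4"),
    use_ggml=yaml_config.get("use_ggml", False),
    use_cache=yaml_config.get("use_cached_bin", False),
)

# Loading with quantization_config follows the project's weight-only
# quantization examples; the model id is a placeholder.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    quantization_config=optimization_config,
    trust_remote_code=True,
)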
