This repository was archived by the owner on Oct 25, 2024. It is now read-only.
Merged
10 changes: 4 additions & 6 deletions README.md
@@ -61,32 +61,30 @@ Below is the sample code to enable weight-only INT4/INT8 inference. See more [ex
### INT4 Inference
```python
from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
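
Note: with this change, `load_in_4bit=True` under the default LLM runtime selects the same configuration the removed line spelled out explicitly (int4 weights, int8 compute dtype), per the `from_pretrained` change further down. A minimal sketch of the explicit equivalent, assuming `WeightOnlyQuantConfig` remains importable from the module shown in the original README and that `quantization_config` is still accepted (the unchanged else-branches in the diff suggest it is):

```python
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

# Explicit equivalent of load_in_4bit=True on the LLM runtime path:
# int4 weights computed in int8, matching the new default introduced below.
config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModelForCausalLM.from_pretrained(
    "Intel/neural-chat-7b-v1-1", quantization_config=config
)
```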

### INT8 Inference
```python
from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
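
Note: similarly, `load_in_8bit=True` under the default LLM runtime maps to int8 weights with bf16 compute, per the `from_pretrained` change below. A minimal sketch of the explicit equivalent, under the same assumptions as above:

```python
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

# Explicit equivalent of load_in_8bit=True on the LLM runtime path:
# int8 weights computed in bf16, matching the new default introduced below.
config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
model = AutoModelForCausalLM.from_pretrained(
    "Intel/neural-chat-7b-v1-1", quantization_config=config
)
```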

5 changes: 2 additions & 3 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -64,16 +64,15 @@ pip install intel-extension-for-transformers
You can simply use the Python API to run a Hugging Face model. Here is the sample code:
```python
from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```
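
Note: the one-argument form relies on the LLM runtime default (int4 weights, int8 compute). The non-runtime path instead falls back to nf4 weights with the torch dtype as compute dtype, per the else branch in the `from_pretrained` diff below. A hedged sketch of opting out of the runtime, assuming `use_llm_runtime` is accepted as a keyword argument to `from_pretrained` (the diff reads it as a local variable but does not show where it is set):

```python
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

# Hypothetical call: with the LLM runtime disabled, load_in_4bit falls back to
# the nf4 / torch_dtype default shown in the else branch of the diff below.
model = AutoModelForCausalLM.from_pretrained(
    "Intel/neural-chat-7b-v1-1",
    load_in_4bit=True,
    use_llm_runtime=False,  # assumption: forwarded into from_pretrained
)
```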

@@ -101,9 +101,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        torch_dtype = kwargs.pop("torch_dtype", torch.float32)
        if load_in_4bit:
            if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="nf4"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="int8", weight_dtype="int4"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="nf4"
+                    )
            else:
                assert (
                    "4" in quantization_config.weight_dtype
@@ -112,9 +117,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                    f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}."
        elif load_in_8bit:
            if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="int8"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="bf16", weight_dtype="int8"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="int8"
+                    )
            else:
                assert (
                    quantization_config.weight_dtype == "int8"
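
For readability, here is a condensed sketch of the default-selection logic these two hunks introduce, pulled out as a standalone helper. It is only the decision table implied by the diff (using the import path shown in the original README), not the actual implementation:

```python
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig


def default_quant_config(load_in_4bit, load_in_8bit, use_llm_runtime, torch_dtype):
    """Summarize the WeightOnlyQuantConfig defaults implied by this change."""
    if load_in_4bit:
        if use_llm_runtime:
            # LLM runtime path: int4 weights, int8 compute.
            return WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
        # Non-runtime path keeps the previous nf4 default.
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="nf4")
    if load_in_8bit:
        if use_llm_runtime:
            # LLM runtime path: int8 weights, bf16 compute.
            return WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
        # Non-runtime path keeps the previous int8 default.
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="int8")
    return None  # no weight-only default requested
```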