This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit f49f2d9

[LLM Runtime] Add MX-Format (FP8_E5M2, FP8_E4M3, FP4_E2M1, NF4) (#872)
* add fp8 in llm frontend

Signed-off-by: Yu, Zhentao <zhentao.yu@intel.com>
1 parent c02dd7b commit f49f2d9

13 files changed, +156 -39 lines changed

.github/workflows/script/models/cpp_graph_inference.sh

Lines changed: 9 additions & 1 deletion
@@ -25,7 +25,7 @@ function main() {
         quant_script="./build/bin/quant_llama"
         infer_cmd="./build/bin/run_llama"
         input_model="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
-        precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
+        precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q8e4m3_j_f32_g128_fp8" "q8e5m2_j_f32_g128_fp8" "q4e2m1_j_f32_g128" "nf4_j_f32_g128")
     elif [[ "${model}" == "gpt-neox-20b" ]]; then
         convert_script="${working_dir}/scripts/convert_gptneox.py"
         quant_script="./build/bin/quant_gptneox"
@@ -120,6 +120,14 @@ function main() {
         # deprecated since bfloat16 scale not mature
         # elif [[ ${precision} == "q4_j_vnni_bf16_b32" ]]; then
         #     ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype bf16 --compute_dtype int8 --alg sym
+        elif [[ ${precision} == "q8e4m3_j_f32_g128_fp8" ]]; then
+            ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8 --group_size 128 --scale_dtype fp8 --compute_dtype fp32 --alg sym
+        elif [[ ${precision} == "q8e5m2_j_f32_g128_fp8" ]]; then
+            ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8_e5m2 --group_size 128 --scale_dtype fp8 --compute_dtype fp32 --alg sym
+        elif [[ ${precision} == "q4e2m1_j_f32_g128" ]]; then
+            ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+        elif [[ ${precision} == "nf4_j_f32_g128" ]]; then
+            ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype nf4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym
         elif [[ ${precision} == "q4_j_vnni_b32" ]]; then
             ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_dtype int8 --alg sym
         elif [[ ${precision} == "q4_j_b32" ]]; then
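
For reference (a summary of the diff above, not code from the repository), the four new CI precision tags expand to the following quant_llama flag combinations:

precision_to_flags = {
    # tag:                   (weight_dtype, group_size, scale_dtype, compute_dtype, alg)
    "q8e4m3_j_f32_g128_fp8": ("fp8",      128, "fp8",  "fp32", "sym"),
    "q8e5m2_j_f32_g128_fp8": ("fp8_e5m2", 128, "fp8",  "fp32", "sym"),
    "q4e2m1_j_f32_g128":     ("fp4",      128, "fp32", "fp32", "sym"),
    "nf4_j_f32_g128":        ("nf4",      128, "fp32", "fp32", "sym"),
}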

intel_extension_for_transformers/llm/runtime/graph/README.md

Lines changed: 4 additions & 4 deletions
@@ -259,11 +259,11 @@ Argument description of run.py:
 | Argument | Description |
 | -------------- | --------------------------------------------------------------------- |
 | model | Directory containing model file or model id: String |
-| --weight_dtype | Data type of quantized weight: int4/int8 (default int4) |
+| --weight_dtype | Data type of quantized weight: int4/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4e2m1)/nf4 (default int4) |
 | --alg | Quantization algorithm: sym/asym (default sym) |
-| --group_size | Group size: Int (default: 32) |
-| --scale_dtype | Data type of scales: fp32/bf16 (dafault fp32) |
-| --compute_dtype | Data type of Gemm computation: int8/bf16/fp32 (default: int8) |
+| --group_size | Group size: Int, 32/128/-1 (per channel) (default: 32) |
+| --scale_dtype | Data type of scales: fp32/bf16/fp8 (dafault fp32) |
+| --compute_dtype | Data type of Gemm computation: int8/bf16/fp16/fp32 (default: int8) |
 | --use_ggml | Enable ggml for quantization and inference |
 | -p / --prompt | Prompt to start generation with: String (default: empty) |
 | -n / --n_predict | Number of tokens to predict: Int (default: -1, -1 = infinity) |
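
As a usage illustration of the updated table (a sketch, not part of the commit), the new dtypes can be exercised through run.py roughly as follows. It assumes the command is launched from the llm/runtime/graph directory, reuses the model id from the Python example later in this diff, and follows the fp8-scale / fp32-compute pairings enforced by quantize.py below:

import subprocess

# Quantize and run with fp8_e5m2 weights; swap in fp8/fp8_e4m3/fp4/fp4_e2m1/nf4 as needed.
subprocess.run([
    "python", "scripts/run.py", "Intel/neural-chat-7b-v1-1",
    "--weight_dtype", "fp8_e5m2",
    "--scale_dtype", "fp8",        # fp8 weights currently pair with fp8 scales
    "--compute_dtype", "fp32",     # float weight dtypes do not use int8 compute
    "--group_size", "128",
    "-p", "Once upon a time, a little girl",
    "-n", "32",
], check=True)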

intel_extension_for_transformers/llm/runtime/graph/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -82,10 +82,6 @@ def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
         self.__import_package(self.model_type)
 
         # check cache and quantization
-        if use_quant:
-            if quant_kwargs['weight_dtype'] == "int8" and quant_kwargs['compute_dtype'] == "bf16":
-                raise ValueError("Error: This combination (weight_dtype=int8, compute_dtype=bf16)"
-                                 " is not currently supported. Please use other combinations.")
         output_path = "runtime_outs"
         os.makedirs(output_path, exist_ok=True)
         fp32_bin = "{}/ne_{}_f32.bin".format(output_path, self.model_type)

intel_extension_for_transformers/llm/runtime/graph/application/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -673,6 +673,12 @@ bool quant_params_parse(int argc, char** argv, quant_params& params) { // NOLIN
       params.nthread = std::stoi(argv[++i]);
     } else if (arg == "--weight_dtype") {
       params.weight_dtype = argv[++i];
+      if (params.weight_dtype == "fp8") {
+        params.weight_dtype = "fp8_e4m3";
+      }
+      if (params.weight_dtype == "fp4") {
+        params.weight_dtype = "fp4_e2m1";
+      }
     } else if (arg == "--alg") {
       params.alg = argv[++i];
     } else if (arg == "--group_size") {

intel_extension_for_transformers/llm/runtime/graph/application/common.h

Lines changed: 5 additions & 0 deletions
@@ -138,10 +138,15 @@ struct quant_params {
   std::string config = "";
   int nthread = 1;
 
+  // [int4, int8, fp8_e5m2, fp8_e4m3, fp4_e2m1, nf4]
   std::string weight_dtype = "int4";
+  // [sym, asym]
   std::string alg = "sym";
+  // [-1, 32, 128]
   int32_t group_size = 32;
+  // [fp32, bf16, fp8]
   std::string scale_dtype = "fp32";
+  // [fp32, fp16, bf16, int8]
   std::string compute_dtype = "int8";
   std::string model_name = "unknown";
   bool use_ggml = false;

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp

Lines changed: 6 additions & 3 deletions
@@ -880,15 +880,18 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
   if (params.bits == quant_bits::q8) {
     quant_type = JBLAS_DTYPE::S8;
   }
-  if (params.bits == quant_bits::fp4) {
+  if (params.bits == quant_bits::fp4_e2m1) {
     quant_type = JBLAS_DTYPE::F4_E2M1;
   }
   if (params.bits == quant_bits::nf4) {
     quant_type = JBLAS_DTYPE::F4_NF4;
   }
-  if (params.bits == quant_bits::fp8) {
+  if (params.bits == quant_bits::fp8_e4m3) {
     quant_type = JBLAS_DTYPE::F8_E4M3;
   }
+  if (params.bits == quant_bits::fp8_e5m2) {
+    quant_type = JBLAS_DTYPE::F8_E5M2;
+  }
   auto dtype_type = static_cast<JBLAS_DTYPE>(
       jblas::utils::jblas_dtype_get_mask_val(quant_type, JBLAS_DTYPE::TypeMask, JBLAS_DTYPE::TypeShift));
   if (dtype_type == JBLAS_DTYPE::TypeFloat) {
@@ -906,7 +909,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
   if (params.scale_dtype == quant_sdtype::fp16) {
     printf("Current not support float16 scale, reset to bf16\n");
   }
-  if (quant_type == JBLAS_DTYPE::F8_E4M3) {
+  if (quant_type == JBLAS_DTYPE::F8_E4M3 || quant_type == JBLAS_DTYPE::F8_E5M2) {
     if (params.scale_dtype != quant_sdtype::fp8) {
       printf("Warning: fp8 weight only supports fp8 scale now\n");
     }
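
For orientation, the four float weight formats mapped above have the following rough numeric properties (general format facts, not values read from the jblas headers), summarized as a Python sketch:

jblas_float_formats = {
    "F8_E4M3": {"bits": 8, "exp_bits": 4, "mant_bits": 3, "max_normal": 448.0},
    "F8_E5M2": {"bits": 8, "exp_bits": 5, "mant_bits": 2, "max_normal": 57344.0},
    "F4_E2M1": {"bits": 4, "exp_bits": 2, "mant_bits": 1, "max_normal": 6.0},
    "F4_NF4":  {"bits": 4, "levels": 16, "value_range": (-1.0, 1.0)},  # normal-float lookup table
}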

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/quant_config.h

Lines changed: 8 additions & 5 deletions
@@ -18,22 +18,25 @@
 #include "core/data_types.h"
 #include "jblas/jit_blas.h"
 
-enum class quant_bits : int { q4 = 0, q8, fp4, nf4, fp8, count };
+enum class quant_bits : int { q4 = 0, q8, fp4_e2m1, nf4, fp8_e4m3, fp8_e5m2, count };
 static inline quant_bits parse_bits(const std::string& bits) {
   if (bits == "int4") {
     return quant_bits::q4;
   }
   if (bits == "int8") {
     return quant_bits::q8;
   }
-  if (bits == "fp4") {
-    return quant_bits::fp4;
+  if (bits == "fp4_e2m1" || bits == "fp4") {
+    return quant_bits::fp4_e2m1;
   }
   if (bits == "nf4") {
     return quant_bits::nf4;
   }
-  if (bits == "fp8") {
-    return quant_bits::fp8;
+  if (bits == "fp8_e4m3" || bits == "fp8") {
+    return quant_bits::fp8_e4m3;
+  }
+  if (bits == "fp8_e5m2") {
+    return quant_bits::fp8_e5m2;
   }
   return quant_bits::count;
 }
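
To make the new enum names concrete, here is a minimal generic mini-float decoder (an illustrative sketch, not code from this repository); it ignores NaN/Inf encodings, which differ between E4M3 and E5M2:

def decode_minifloat(code: int, exp_bits: int, mant_bits: int, bias: int) -> float:
    """Decode an unsigned sign/exponent/mantissa bit pattern into a Python float."""
    sign = -1.0 if (code >> (exp_bits + mant_bits)) & 1 else 1.0
    exp = (code >> mant_bits) & ((1 << exp_bits) - 1)
    mant = code & ((1 << mant_bits) - 1)
    if exp == 0:  # subnormal: no implicit leading 1
        return sign * (mant / (1 << mant_bits)) * 2.0 ** (1 - bias)
    return sign * (1.0 + mant / (1 << mant_bits)) * 2.0 ** (exp - bias)

print(decode_minifloat(0x38, 4, 3, 7))   # fp8_e4m3 (bias 7)  -> 1.0
print(decode_minifloat(0x38, 5, 2, 15))  # fp8_e5m2 (bias 15) -> 0.5
print(decode_minifloat(0x7, 2, 1, 1))    # fp4_e2m1 (bias 1)  -> 6.0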

intel_extension_for_transformers/llm/runtime/graph/scripts/python_api_example.py

Lines changed: 7 additions & 1 deletion
@@ -19,17 +19,23 @@
 from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
 
 model_name = "Intel/neural-chat-7b-v1-1" # or local path to model
+# int4 weight_only quantization
 woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
+# nf4 weight_only quantization
+# woq_config = WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="nf4")
+# fp8 weight_only quantization
+# woq_config = WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="fp8")
 prompt = "Once upon a time, a little girl"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
 # top_k_top_p sample or greedy_search
+model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 # beam search
+model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
 outputs = model.generate(inputs, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True)
 ans = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 print(ans)
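
A small follow-up sketch (not part of the commit) that gathers the configurations demonstrated above so one can be picked by name; it only uses the compute_dtype/weight_dtype pairings shown in this example:

woq_configs = {
    "int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4"),
    "nf4": WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="nf4"),
    "fp8": WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="fp8"),  # treated as fp8_e4m3 downstream
}
woq_config = woq_configs["nf4"]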

intel_extension_for_transformers/llm/runtime/graph/scripts/quantize.py

Lines changed: 33 additions & 5 deletions
@@ -52,26 +52,36 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--nthread", type=int, help="Number of threads to use: Int (default: 1)", default=1)
     parser.add_argument(
         "--weight_dtype",
-        choices=["int4", "int8"],
+        choices=["int4", "int8", "fp8", "fp8_e5m2", "fp8_e4m3",
+                 "fp4", "fp4_e2m1", "nf4"],
         help="Data type of quantized weight: int4/int8 (default: int4)",
         default="int4",
     )
     parser.add_argument(
         "--alg",
         type=str,
+        choices=["sym", "asym"],
         help="Quantization algorithm to use: sym/asym (default: sym)",
         default="sym",
     )
-    parser.add_argument("--group_size", type=int, help="Group size: Int (default: 32)", default=32)
+    parser.add_argument(
+        "--group_size",
+        type=int,
+        choices=[-1, 32, 128],
+        help="Group size: Int (default: 32)",
+        default=32,
+    )
     parser.add_argument(
         "--scale_dtype",
         type=str,
+        choices=["fp32", "bf16", "fp8"],
         help="Data type of scales: bf16/fp32 (default: fp32)",
         default="fp32",
     )
     parser.add_argument(
         "--compute_dtype",
         type=str,
+        choices=["fp32", "fp16", "bf16", "int8"],
         help="Data type of Gemm computation: int8/bf16/fp32 (default: int8)",
         default="int8",
     )
@@ -97,10 +107,28 @@ def main(args_in: Optional[List[str]] = None) -> None:
     cmd.extend(["--out_file", args.out_file])
     cmd.extend(["--nthread", str(args.nthread)])
     cmd.extend(["--weight_dtype", str(args.weight_dtype)])
-    cmd.extend(["--alg", args.alg])
+    if (str(args.weight_dtype))[:3] in ["fp8", "fp4", "nf4"] and str(args.alg) in ["asym"]:
+        print("WARNING: asym alg is not be supported in float quant types. Fall back to sym.");
+        cmd.extend(["--alg", "sym"])
+    else:
+        cmd.extend(["--alg", args.alg])
     cmd.extend(["--group_size", str(args.group_size)])
-    cmd.extend(["--scale_dtype", args.scale_dtype])
-    cmd.extend(["--compute_dtype", args.compute_dtype])
+    if (str(args.weight_dtype))[:3] not in ["fp8"]:
+        sdtype = str(args.scale_dtype)
+        if str(args.scale_dtype) in ["fp8"]:
+            print("WARNING: fp8 scale is only be supported in fp8 weight type. Fall back to fp32.");
+            sdtype = "fp32"
+        cmd.extend(["--scale_dtype", sdtype])
+    else:
+        if str(args.scale_dtype) != "fp8":
+            print("WARNING: fp8 weight type only supports fp8 scale now.Fall back to fp8.")
+        cmd.extend(["--scale_dtype", "fp8"])
+    if (str(args.weight_dtype))[:3] in ["fp8", "fp4", "nf4"] and str(args.compute_dtype) in ["int8"]:
+        print("WARNING: int8 compute dtype is not be supported in float quant types! "\
+              "Fall back to fp32.")
+        cmd.extend(["--compute_dtype", "fp32"])
+    else:
+        cmd.extend(["--compute_dtype", args.compute_dtype])
     if args.use_ggml:
         cmd.extend(["--use_ggml"])
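
The dtype-compatibility rules added above can be restated as a pure function (a sketch for readability, not code from the repository), which makes the weight_dtype/alg/scale_dtype/compute_dtype interactions easier to scan:

def resolve_quant_args(weight_dtype: str, alg: str, scale_dtype: str, compute_dtype: str):
    """Mirror the quantize.py fallbacks: float weight dtypes are sym-only,
    fp8 weights pair with fp8 scales, and float weights do not use int8 compute."""
    is_float_weight = weight_dtype[:3] in ("fp8", "fp4", "nf4")
    if is_float_weight and alg == "asym":
        alg = "sym"
    if weight_dtype[:3] == "fp8":
        scale_dtype = "fp8"
    elif scale_dtype == "fp8":
        scale_dtype = "fp32"
    if is_float_weight and compute_dtype == "int8":
        compute_dtype = "fp32"
    return alg, scale_dtype, compute_dtype

# Example: resolve_quant_args("fp8_e5m2", "asym", "fp32", "int8") -> ("sym", "fp8", "fp32")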

intel_extension_for_transformers/llm/runtime/graph/scripts/run.py

Lines changed: 12 additions & 2 deletions
@@ -42,26 +42,36 @@ def main(args_in: Optional[List[str]] = None) -> None:
     # quantization related arguments.
     parser.add_argument(
         "--weight_dtype",
-        choices=["int4", "int8"],
+        choices=["int4", "int8", "fp8", "fp8_e5m2", "fp8_e4m3",
+                 "fp4", "fp4_e2m1", "nf4"],
         help="Data type of quantized weight: int4/int8 (default int4)",
         default="int4",
     )
     parser.add_argument(
         "--alg",
         type=str,
+        choices=["sym", "asym"],
         help="Quantization algorithm: sym/asym (default sym)",
         default="sym",
     )
-    parser.add_argument("--group_size", type=int, help="Group size: Int (default: 32)", default=32)
+    parser.add_argument(
+        "--group_size",
+        type=int,
+        choices=[-1, 32, 128],
+        help="Group size: Int (default: 32)",
+        default=32,
+    )
     parser.add_argument(
         "--scale_dtype",
         type=str,
+        choices=["fp32", "bf16", "fp8"],
         help="Data type of scales: fp32/bf16 (dafault fp32)",
         default="fp32",
     )
     parser.add_argument(
         "--compute_dtype",
         type=str,
+        choices=["fp32", "fp16", "bf16", "int8"],
         help="Data type of Gemm computation: int8/bf16/fp32 (default: int8)",
         default="int8",
     )
