intel
diff --git a/.github/workflows/script/models/cpp_graph_inference.sh b/.github/workflows/script/models/cpp_graph_inference.sh
Lines changed: 6 additions & 6 deletions
diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/application/common.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/application/common.cpp
Lines changed: 0 additions & 221 deletions
diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/application/common.h b/intel_extension_for_transformers/backends/neural_engine/graph/application/common.h
Lines changed: 0 additions & 5 deletions
@@ -17,38 +17,38 @@ function main() {
  if [[ "${model}" == "llama-7b-hf" ]]; then
  convert_script="${working_dir}/scripts/convert_llama.py"
  quant_script="./build/bin/quant_llama"
- infer_cmd="./build/bin/main_llama"
+ infer_cmd="./build/bin/chat_llama"
  input_model="/tf_dataset2/models/nlp_toolkit/llama-7b-hf"
  precision_list=("q4")
  elif [[ "${model}" == "gpt-neox-20b" ]]; then
  convert_script="${working_dir}/scripts/convert_gptneox.py"
  quant_script="./build/bin/quant_gptneox"
- infer_cmd="./build/bin/main_gptneox"
+ infer_cmd="./build/bin/chat_gptneox"
  input_model="/tf_dataset2/models/nlp_toolkit/gpt-neox-20b"
  precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
  elif [[ "${model}" == "mpt-7b" ]]; then
  convert_script="${working_dir}/scripts/convert_mpt.py"
  quant_script="./build/bin/quant_mpt"
- infer_cmd="./build/bin/main_mpt"
+ infer_cmd="./build/bin/chat_mpt"
  input_model="/tf_dataset2/models/nlp_toolkit/mpt-7b"
  precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
  elif [[ "${model}" == "falcon-7b" ]]; then
  convert_script="${working_dir}/scripts/convert_falcon.py"
  quant_script="./build/bin/quant_falcon"
- infer_cmd="./build/bin/main_falcon"
+ infer_cmd="./build/bin/chat_falcon"
  input_model="/tf_dataset2/models/nlp_toolkit/falcon-7b"
  precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
  elif [[ "${model}" == "gptj-6b" ]]; then
  convert_script="${working_dir}/scripts/convert_gptj.py"
  quant_script="./build/bin/quant_gptj"
- infer_cmd="./build/bin/main_gptj"
+ infer_cmd="./build/bin/chat_gptj"
  model_name="EleutherAI/gpt-j-6b"
  input_model="/tf_dataset2/models/pytorch/gpt-j-6B"
  precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
  elif [[ "${model}" == "starcoder-3b" ]]; then
  convert_script="${working_dir}/scripts/convert_starcoder.py"
  quant_script="./build/bin/quant_starcoder"
- infer_cmd="./build/bin/main_starcoder"
+ infer_cmd="./build/bin/chat_starcoder"
  model_name="bigcode/starcoder"
  input_model="/tf_dataset2/models/pytorch/starcode_3b"
  precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
 
@@ -776,227 +776,6 @@ ne_type quant_params_to_type(const quant_params& params) {
  }
  return NE_TYPE_F32;
 }
-size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params params, int n, int k) {
- using CompType = jblas::prologue::weight_comp::gemm::WeightCompType;
- auto cd = jblas::utils::parallel::CpuDevice::getInstance();
- jblas::prologue::PackedWeight* packedw = NULL;
- auto type = CompType::S4_F32;
- if (params.bits == 4) {
- if (params.scale_dtype == "bf16") {
- type = CompType::S4_Bf16;
- } else {
- type = CompType::S4_F32;
- }
- } else if (params.bits == 8) {
- type = CompType::S8_F32;
- } else {
- return 0;
- }
- cd->setThreads(params.nthread);
- if (params.bits == 4) {
- if (params.compute_type == "int8") {
- using GemmKernel = jblas::wrapper::gemm_default::weight_comp::avx512_vnni::GemmKernelDynamicQuantS4KBlock;
- static GemmKernel kernel;
- assert(cd->AVX512F());
- packedw = kernel.getWeightPtr()->compressWeightTranspose(n, k, f32ptr, k, params.block_size, type);
- } else if (params.compute_type == "fp32") {
- using GemmKernel = jblas::wrapper::gemm_default::weight_comp::avx512f::GemmKernelS4KBlock;
- static GemmKernel kernel;
- assert(cd->AVX512F());
- packedw = kernel.getWeightPtr()->compressWeightTranspose(n, k, f32ptr, k, params.block_size, type);
- }
- } else if (params.bits == 8) {
- // TODO add 8bit quantization
- }
- assert(packedw != 0);
- auto size = packedw->getSerializedSize();
- packedw->serializeToBuffer(dstpr);
- delete packedw;
- return size;
-}
-
-bool ne_common_quantize_0(std::ifstream& finp, std::ofstream& fout, const quant_params params,
- const std::vector<std::string>& to_quant, const std::vector<std::string>& to_skip) {
- ne_type qtype = quant_params_to_type(params);
- if (!ne_is_quantized(qtype)) {
- fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ne_type_name(qtype));
- return false;
- }
-
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<float> work;
-
- std::vector<uint8_t> data_u8;
- std::vector<ne_fp16_t> data_f16;
- std::vector<float> data_f32;
-
- std::vector<int64_t> hist_all(1 << 4, 0);
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ttype;
-
- finp.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
- finp.read(reinterpret_cast<char*>(&length), sizeof(length));
- finp.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
-
- if (finp.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[4] = {1, 1, 1, 1};
- for (int i = 0; i < n_dims; ++i) {
- finp.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- finp.read(&name[0], length);
-
- printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ne_type_name((ne_type)ttype));
-
- bool quantize = false;
-
- // check if we should quantize this tensor
- for (const auto& s : to_quant) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = true;
- break;
- }
- }
-
- // check if we should skip this tensor
- for (const auto& s : to_skip) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = false;
- break;
- }
- }
-
- // quantize only 2D tensors
- quantize &= (n_dims == 2);
-
- if (quantize) {
- if (ttype != NE_TYPE_F32 && ttype != NE_TYPE_F16) {
- fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype,
- ne_type_name((ne_type)ttype));
- return false;
- }
-
- if (ttype == NE_TYPE_F16) {
- data_f16.resize(nelements);
- finp.read(reinterpret_cast<char*>(data_f16.data()), nelements * sizeof(ne_fp16_t));
- data_f32.resize(nelements);
- for (int i = 0; i < nelements; ++i) {
- data_f32[i] = ne_fp16_to_fp32(data_f16[i]);
- }
- } else {
- data_f32.resize(nelements);
- finp.read(reinterpret_cast<char*>(data_f32.data()), nelements * sizeof(float));
- }
-
- ttype = qtype;
- } else {
- const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
-
- data_u8.resize(nelements * bpe);
- finp.read(reinterpret_cast<char*>(data_u8.data()), nelements * bpe);
- }
-
- fout.write(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
- fout.write(reinterpret_cast<char*>(&length), sizeof(length));
- fout.write(reinterpret_cast<char*>(&ttype), sizeof(ttype));
- for (int i = 0; i < n_dims; ++i) {
- fout.write(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
- }
- fout.write(&name[0], length);
-
- if (quantize) {
- work.resize(nelements); // for quantization
-
- size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch ((ne_type)ttype) {
- case NE_TYPE_Q4_0: {
- cur_size = ne_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case NE_TYPE_Q4_1: {
- cur_size = ne_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case NE_TYPE_Q5_0: {
- cur_size = ne_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case NE_TYPE_Q5_1: {
- cur_size = ne_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case NE_TYPE_Q8_0: {
- cur_size = ne_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case NE_TYPE_JBLAS: {
- cur_size = jblas_quantize(data_f32.data(), work.data(), params, ne[1], ne[0]);
- if (cur_size == 0) {
- fprintf(stderr, "%s: unsupported jblas quantization parameters %d %s %s\n", __func__, params.bits,
- params.alg.c_str(), params.compute_type.c_str());
- return false;
- }
- } break;
- case NE_TYPE_F16:
- case NE_TYPE_I8:
- case NE_TYPE_I16:
- case NE_TYPE_I32:
- case NE_TYPE_Q8_1:
- case NE_TYPE_COUNT: {
- fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ne_type_name((ne_type)ttype));
- return false;
- }
- }
-
- fout.write(reinterpret_cast<char*>(work.data()), cur_size);
- total_size_new += cur_size;
-
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float) / 1024.0 / 1024.0,
- cur_size / 1024.0 / 1024.0);
- for (int i = 0; i < (int)hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < (int)hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
- } else {
- printf("size = %8.3f MB\n", data_u8.size() / 1024.0 / 1024.0);
- fout.write(reinterpret_cast<char*>(data_u8.data()), data_u8.size());
- total_size_new += data_u8.size();
- }
-
- total_size_org += nelements * sizeof(float);
- }
-
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
- printf("%s: quant size = %8.2f MB | qtype = %d (%s)\n", __func__, total_size_new / 1024.0 / 1024.0, qtype,
- ne_type_name(qtype));
-
- {
- int64_t sum_all = 0;
- for (int i = 0; i < (int)hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (int i = 0; i < (int)hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
-
- return true;
-}
 
 void console_init(console_state& con_st) {
 #if defined(_WIN32)
 
@@ -160,11 +160,6 @@ ne_ftype quant_params_to_ftype(const quant_params& params);
 
 bool quant_params_parse(int argc, char** argv, quant_params& params);
 
-size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params params, int n, int k);
-
-bool ne_common_quantize_0(std::ifstream& finp, std::ofstream& fout, const quant_params params,
- const std::vector<std::string>& to_quant, const std::vector<std::string>& to_skip);
-
 #define ANSI_COLOR_RED "\x1b[31m"
 #define ANSI_COLOR_GREEN "\x1b[32m"
 #define ANSI_COLOR_YELLOW "\x1b[33m"