intel
diff --git a/‎intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp‎
Lines changed: 0 additions & 3 deletions b/‎intel_extension_for_transformers/llm/runtime/graph/application/main_run.cpp‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp‎
Lines changed: 4 additions & 10 deletions b/‎intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp‎
Lines changed: 4 additions & 10 deletions
diff --git a/‎intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp‎
Lines changed: 7 additions & 16 deletions b/‎intel_extension_for_transformers/llm/runtime/graph/models/llama/llama.cpp‎
Lines changed: 7 additions & 16 deletions
@@ -461,12 +461,9 @@ int main(int argc, char** argv) {
 
  std::vector<model_token_data> candidates;
  candidates.reserve(n_vocab);
- std::ofstream outFile("logits.txt", std::ios::app);
  for (model_token token_id = 0; token_id < n_vocab; token_id++) {
- outFile << logits[token_id] << " ";
  candidates.emplace_back(model_token_data{token_id, logits[token_id], 0.0f});
  }
- outFile << "\n";
 
  model_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
 
 
@@ -173,15 +173,7 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok
  1 * N * n_embd * batch_size * ne_element_size(QKVcur)),
  n_embd / n_head, n_head, N, batch_size),
  n_past, n_rot, 0, 0);
- if (!run_mha_reordered) {
- Vcur = ne_view_1d(ctx0, QKVcur, N * n_embd * batch_size, 2 * N * n_embd * batch_size * ne_element_size(QKVcur));
- } else {
- Vcur = ne_reshape_4d(
- ctx0,
- ne_view_1d(ctx0, QKVcur, N * n_embd * batch_size, 2 * N * n_embd * batch_size * ne_element_size(QKVcur)),
- n_embd / n_head, n_head, N, batch_size);
- }
-
+ Vcur = ne_view_1d(ctx0, QKVcur, N * n_embd * batch_size, 2 * N * n_embd * batch_size * ne_element_size(QKVcur));
  } else {
  if (!enable_tp) {
  // printf("\n\n\n work into attention split,\n\n\n");
@@ -291,7 +283,9 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok
  head_size, n_ctx, n_head, batch_size, // ne
  0, 0, v_size, // nb (jblas managed)
  il * kv_n_ctx_block * v_size); // offset
- ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur, n_past));
+ // jblas always views V as (D, n_head, seq, bs)
+ const auto Vcur_plain = ne_reshape_4d(ctx0, Vcur, n_embd / n_head, n_head, N, batch_size);
+ ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur_plain, n_past));
  }
 
  struct ne_tensor* Q = ne_permute(ctx0, Qcur, 0, 2, 1, 3);
 
@@ -149,14 +149,9 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
  ne_reshape_3d(ctx0, ne_view_1d(ctx0, QKVcur, N * n_embd, 1 * N * n_embd * ne_element_size(QKVcur)),
  n_embd / n_head, n_head, N),
  n_past, n_rot, 0, 0);
- if (!run_mha_reordered) {
- Vcur = ne_transpose(
- ctx0, ne_reshape_2d(ctx0, ne_view_1d(ctx0, QKVcur, N * n_embd, 2 * N * n_embd * ne_element_size(QKVcur)),
- n_embd, N));
- } else {
- Vcur = ne_reshape_3d(ctx0, ne_view_1d(ctx0, QKVcur, N * n_embd, 2 * N * n_embd * ne_element_size(QKVcur)),
- n_embd / n_head, n_head, N);
- }
+ Vcur = ne_transpose(
+ ctx0, ne_reshape_2d(ctx0, ne_view_1d(ctx0, QKVcur, N * n_embd, 2 * N * n_embd * ne_element_size(QKVcur)),
+ n_embd, N));
 
  } else {
  Qcur = ne_rope_inplace(
@@ -165,13 +160,7 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
  Kcur = ne_rope_inplace(
  ctx0, ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), n_embd / n_head, n_head, N),
  n_past, n_rot, 0, 0);
- if (!run_mha_reordered) {
- Vcur = ne_transpose(ctx0, ne_reshape_2d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), n_embd, N));
- } else {
- Vcur = ne_rope_inplace(
- ctx0, ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), n_embd / n_head, n_head, N),
- n_past, n_rot, 0, 0);
- }
+ Vcur = ne_transpose(ctx0, ne_reshape_2d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), n_embd, N));
  }
  ne_set_name(Qcur, "Qcur");
  ne_set_name(Kcur, "Kcur");
@@ -258,7 +247,9 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
  head_size, n_ctx, n_head, // ne
  0, 0, // nb (jblas managed)
  il * v_size); // offset
- ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur, n_past));
+ // jblas always views V as (D, n_head, seq)
+ const auto Vcur_plain = ne_reshape_3d(ctx0, ne_view_1d(ctx0, Vcur, n_embd * N, 0), n_embd / n_head, n_head, N);
+ ne_build_forward_expand(&gf, ne_flash_attn_update_v(ctx0, v_cache, Vcur_plain, n_past));
  }
 
  struct ne_tensor* Q = ne_permute(ctx0, Qcur, 0, 2, 1, 3);