Commit 8409ce0

Add default parameters for compatibility (#3354)
* Add default parameters for compatibility
* Change interface in docs and _meta_registrations
1 parent 08610f7 commit 8409ce0
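
The compatibility mechanism is simply that k_scale and v_scale now default to 1.0 and the optional alibi_slopes argument moves ahead of them, so call sites written before the fp8 KV-cache scales existed keep working. Below is a minimal, illustrative sketch of that calling convention; the wrapper function is a stand-in mirroring the reshape_and_cache argument list shown in the _meta_registrations.py change further down, not part of the repository.

import torch

# Stand-in mirroring the updated reshape_and_cache interface
# (key, value, key_cache, value_cache, slot_mapping, k_scale=1.0, v_scale=1.0);
# it only checks the calling convention, it does not touch a real KV cache.
def reshape_and_cache_like(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    k_scale: float = 1.0,  # new default: non-fp8 callers may omit it
    v_scale: float = 1.0,  # new default: non-fp8 callers may omit it
) -> None:
    assert key.shape == value.shape and slot_mapping.dtype == torch.long

key = value = torch.randn(4, 8, 64)
key_cache = value_cache = torch.empty(16, 8, 16, 64)
slots = torch.arange(4, dtype=torch.long)

reshape_and_cache_like(key, value, key_cache, value_cache, slots)            # pre-fp8 call site, still valid
reshape_and_cache_like(key, value, key_cache, value_cache, slots, 0.5, 0.5)  # fp8 caches pass explicit scales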

File tree: 9 files changed (+76, -67 lines changed)


csrc/cpu/aten/PagedAttention.cpp

Lines changed: 8 additions & 8 deletions
@@ -24,9 +24,9 @@ void single_query_cached_kv_attention_forward_cpu(
     at::Tensor& context_lens, // [num_seqs]
     int64_t block_size,
     int64_t max_context_len,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   return single_query_cached_kv_attention_kernel_stub(
       kCPU,
       out,
@@ -39,9 +39,9 @@ void single_query_cached_kv_attention_forward_cpu(
       context_lens,
       block_size,
       max_context_len,
+      alibi_slopes,
       k_scale,
-      v_scale,
-      alibi_slopes);
+      v_scale);
 }
 
 void reshape_and_cache_cpu(
@@ -68,9 +68,9 @@ void flash_attn_varlen_cpu(
     const double softmax_scale,
     bool is_causal,
     at::Tensor& block_table,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   return flash_attn_var_len_kernel_stub(
       kCPU,
       out,
@@ -84,9 +84,9 @@ void flash_attn_varlen_cpu(
       softmax_scale,
       is_causal,
       block_table,
+      alibi_slopes,
       k_scale,
-      v_scale,
-      alibi_slopes);
+      v_scale);
 }
 
 } // namespace cpu

csrc/cpu/aten/PagedAttention.h

Lines changed: 8 additions & 8 deletions
@@ -19,9 +19,9 @@ void single_query_cached_kv_attention(
     at::Tensor& context_lens, // [num_seqs]
     int64_t block_size,
     int64_t max_context_len,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes);
+    const double v_scale);
 }
 
 void reshape_and_cache(
@@ -45,9 +45,9 @@ void flash_attn_varlen(
     const double softmax_scale,
     bool is_causal,
     at::Tensor& block_table,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes);
+    const double v_scale);
 
 using single_query_cached_kv_attention_fn = void (*)(
     at::Tensor& out, // [num_seqs, num_heads, head_size]
@@ -60,9 +60,9 @@ using single_query_cached_kv_attention_fn = void (*)(
     at::Tensor& context_lens, // [num_seqs]
     int64_t block_size,
     int64_t max_context_len,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes);
+    const double v_scale);
 
 using reshape_and_cache_fn = void (*)(
     at::Tensor& key,
@@ -85,9 +85,9 @@ using flash_attn_var_len_fn = void (*)(
     const double softmax_scale,
     bool is_causal,
     at::Tensor& block_table,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes);
+    const double v_scale);
 
 IPEX_DECLARE_DISPATCH(
     single_query_cached_kv_attention_fn,

csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp

Lines changed: 30 additions & 31 deletions
@@ -409,10 +409,10 @@ inline void _mul_reduce_max_fusion_kernel(
  * @param block_size The block size which means the number of token in every
  * block.
  * @param max_context_len Maximum context length.
- * @param k_scale Scaling factor for key cache of data type fp8.
- * @param v_scale Scaling factor for value cache of data type fp8.
  * @param alibi_slopes Optional tensor of alibi slopes with the shape of
  * (num_heads).
+ * @param k_scale Scaling factor for key cache of data type fp8.
+ * @param v_scale Scaling factor for value cache of data type fp8.
  */
 template <typename scalar_t, typename cache_t>
 void single_query_cached_kv_attention_kernel(
@@ -425,9 +425,9 @@ void single_query_cached_kv_attention_kernel(
     at::Tensor& context_lens,
     int64_t block_size,
     int64_t max_context_len,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   auto out_ptr = out.data_ptr<scalar_t>();
   auto query_ptr = query.data_ptr<scalar_t>();
   auto key_cache_ptr = key_cache.data_ptr<cache_t>();
@@ -807,9 +807,9 @@ void flash_attn_varlen_kernel(
     const double softmax_scale, // scale for softmax
     bool is_causal, // whether the attention is causal
     at::Tensor& block_table,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   auto kv_block_strideN = key_cache.stride(0);
   auto kv_block_strideH = key_cache.stride(1);
   auto kv_block_strideP = key_cache.stride(2);
@@ -1027,9 +1027,9 @@ void single_query_cached_kv_attention_kernel_impl(
     at::Tensor& context_lens, // [num_seqs]
     int64_t block_size,
     int64_t max_context_len,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   RECORD_FUNCTION(
       "ipex::single_query_cached_kv_attention_kernel_impl",
       c10::ArrayRef<c10::IValue>({}));
@@ -1046,9 +1046,9 @@ void single_query_cached_kv_attention_kernel_impl(
         context_lens,
         block_size,
         max_context_len,
+        alibi_slopes,
         k_scale,
-        v_scale,
-        alibi_slopes);
+        v_scale);
   } else if (out.scalar_type() == at::ScalarType::Float) {
     single_query_cached_kv_attention_kernel<float, float>(
         out,
@@ -1060,9 +1060,9 @@ void single_query_cached_kv_attention_kernel_impl(
         context_lens,
         block_size,
         max_context_len,
+        alibi_slopes,
         k_scale,
-        v_scale,
-        alibi_slopes);
+        v_scale);
   } else if (out.scalar_type() == at::ScalarType::BFloat16) {
     single_query_cached_kv_attention_kernel<at::BFloat16, at::BFloat16>(
         out,
@@ -1074,9 +1074,9 @@ void single_query_cached_kv_attention_kernel_impl(
         context_lens,
         block_size,
         max_context_len,
+        alibi_slopes,
        k_scale,
-        v_scale,
-        alibi_slopes);
+        v_scale);
   } else if (out.scalar_type() == at::ScalarType::Half) {
     single_query_cached_kv_attention_kernel<at::Half, at::Half>(
         out,
@@ -1088,9 +1088,9 @@ void single_query_cached_kv_attention_kernel_impl(
         context_lens,
         block_size,
         max_context_len,
+        alibi_slopes,
         k_scale,
-        v_scale,
-        alibi_slopes);
+        v_scale);
   } else {
     TORCH_CHECK(
         false, "Unsupported data type for single_query_cached_kv_attention");
@@ -1152,9 +1152,9 @@ void flash_attn_varlen_cpu_kernel_impl(
     const double softmax_scale,
     bool is_causal,
     at::Tensor& block_table,
+    const c10::optional<at::Tensor>& alibi_slopes,
     const double k_scale,
-    const double v_scale,
-    const c10::optional<at::Tensor>& alibi_slopes) {
+    const double v_scale) {
   TORCH_CHECK(
       key.scalar_type() == value.scalar_type(),
       "key and value should have the same data type");
@@ -1173,7 +1173,6 @@ void flash_attn_varlen_cpu_kernel_impl(
   if (query.scalar_type() == at::ScalarType::Float) {
     if (max_seqlen_q >= 768) {
       flash_attn_varlen_kernel<float, float, 128>(
-
           out,
           query,
           key,
@@ -1185,9 +1184,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     } else if (max_seqlen_q >= 192) {
       flash_attn_varlen_kernel<float, float, 64>(
           out,
@@ -1201,9 +1200,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     } else {
       flash_attn_varlen_kernel<float, float, 32>(
           out,
@@ -1217,9 +1216,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     }
 
   } else if (query.scalar_type() == at::ScalarType::BFloat16) {
@@ -1236,9 +1235,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     } else if (max_seqlen_q >= 192) {
       flash_attn_varlen_kernel<at::BFloat16, at::BFloat16, 64>(
           out,
@@ -1252,9 +1251,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     } else {
       flash_attn_varlen_kernel<at::BFloat16, at::BFloat16, 32>(
           out,
@@ -1268,9 +1267,9 @@ void flash_attn_varlen_cpu_kernel_impl(
           softmax_scale,
           is_causal,
           block_table,
+          alibi_slopes,
           k_scale,
-          v_scale,
-          alibi_slopes);
+          v_scale);
     }
 
   } else {
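
Not changed by this commit, but visible in the surrounding context above: flash_attn_varlen_cpu_kernel_impl picks the kernel's query-block template parameter from max_seqlen_q. A minimal Python mirror of that selection rule, for orientation only (the real dispatch is the C++ template instantiation shown in the diff):

# Mirrors the q-block selection in flash_attn_varlen_cpu_kernel_impl:
# long prompts use a 128-row query block, medium ones 64, short ones 32.
def select_q_block_size(max_seqlen_q: int) -> int:
    if max_seqlen_q >= 768:
        return 128
    if max_seqlen_q >= 192:
        return 64
    return 32

assert select_q_block_size(2048) == 128
assert select_q_block_size(256) == 64
assert select_q_block_size(64) == 32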

intel_extension_for_pytorch/_meta_registrations.py

Lines changed: 5 additions & 1 deletion
@@ -130,7 +130,9 @@ def is_channels_last_3d(ten):
 
 
 @register_meta("reshape_and_cache")
-def meta_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping):
+def meta_reshape_and_cache(
+    key, value, key_cache, value_cache, slot_mapping, k_scale, v_scale
+):
     return None
 
 
@@ -147,6 +149,8 @@ def meta_single_query_cached_kv_attention(
     block_size,
     max_context_len,
     alibi_slopes,
+    k_scale,
+    v_scale,
 ):
     return None
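
The meta ("fake") implementations must accept exactly the arguments the operator schema declares, which is why k_scale and v_scale are added here even though the meta functions only return None. A generic torch.library sketch of the same idea; this is not the IPEX register_meta helper, and the "demo" namespace and op name are made up for illustration:

import torch

# Hypothetical namespace and schema purely for illustration.
lib = torch.library.Library("demo", "DEF")
lib.define(
    "reshape_and_cache(Tensor key, Tensor value, Tensor key_cache, "
    "Tensor value_cache, Tensor slot_mapping, float k_scale=1.0, "
    "float v_scale=1.0) -> ()"
)

def meta_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                           k_scale=1.0, v_scale=1.0):
    # Meta kernels only propagate shapes/dtypes; if this signature drifted
    # from the schema, tracing or torch.compile would fail at call time.
    return None

lib.impl("reshape_and_cache", meta_reshape_and_cache, "Meta")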

intel_extension_for_pytorch/llm/modules/mha_fusion.py

Lines changed: 11 additions & 11 deletions
@@ -472,9 +472,9 @@ class PagedAttention:
                 context_lens,
                 block_size,
                 max_context_len,
+                alibi_slopes,
                 k_scale,
                 v_scale,
-                alibi_slopes
             )
 
     This operator is used to be calculated the scale-dot-product based on the paged attention.
@@ -518,9 +518,9 @@ class PagedAttention:
                 scale,
                 is_cusal,
                 block_tables,
+                alibi_slopes,
                 key_cache,
                 val_cache,
-                alibi_slopes
             )
 
     Args:
@@ -539,9 +539,9 @@ class PagedAttention:
         is_cusal (bool): Whether to apply causal attention masking. Default is True. False is not supported yet.
         block_tables:(torch.Tensor): The mapping table used to mapping the logical sequence
             to the physical sequence. The shape should be [batch_size, max_num_blocks_per_seq].
+        alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads).
         k_scale (float): The scale used by the fp8 key cache.
         v_scale (float): The scale used by the fp8 value cache.
-        alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads).
 
     """
 
@@ -555,8 +555,8 @@ def reshape_and_cache(
         key_cache: torch.Tensor,
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
-        k_scale: float,
-        v_scale: float,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
     ):
         return cls.runtime_ops.get_module_from_device(
             key.device.type, IPEXCustomOpType.PAGED_ATTENTION, False
@@ -577,9 +577,9 @@ def single_query_cached_kv_attention(
         context_lens: torch.Tensor,
         block_size: int,
        max_context_len: int,
-        k_scale: float,
-        v_scale: float,
         alibi_slopes: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
     ):
         return cls.runtime_ops.get_module_from_device(
            output.device.type, IPEXCustomOpType.PAGED_ATTENTION, False
@@ -594,9 +594,9 @@ def single_query_cached_kv_attention(
             context_lens,
             block_size,
             max_context_len,
+            alibi_slopes,
             k_scale,
             v_scale,
-            alibi_slopes,
         )
 
     @classmethod
@@ -613,9 +613,9 @@ def flash_attn_varlen_func(
         scale,
         is_cusal: bool,
         block_tables: torch.Tensor,
-        k_scale: float,
-        v_scale: float,
         alibi_slopes: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
     ):
         return cls.runtime_ops.get_module_from_device(
             output.device.type, IPEXCustomOpType.PAGED_ATTENTION, False
@@ -631,9 +631,9 @@ def flash_attn_varlen_func(
             scale,
             is_cusal,
             block_tables,
+            alibi_slopes,
             k_scale,
             v_scale,
-            alibi_slopes,
        )
 
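
Taken together, the Python frontend now expects alibi_slopes before the (now optional) scales. A hedged usage sketch follows; the import path is assumed from the ipex.llm.modules documentation, and the leading arguments of single_query_cached_kv_attention follow the documented interface since only the tail of the argument list is visible in this diff:

import torch
from typing import Optional
from intel_extension_for_pytorch.llm.modules import PagedAttention  # import path assumed from the docs

def decode_step(
    out: torch.Tensor,             # [num_seqs, num_heads, head_size]
    query: torch.Tensor,           # [num_seqs, num_heads, head_size]
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    head_mapping: torch.Tensor,
    scale: float,
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    block_size: int,
    max_context_len: int,
    alibi_slopes: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # alibi_slopes now sits before k_scale/v_scale, and both scales default to
    # 1.0, so non-fp8 callers can simply stop at alibi_slopes.
    PagedAttention.single_query_cached_kv_attention(
        out, query, key_cache, value_cache, head_mapping, scale,
        block_tables, context_lens, block_size, max_context_len,
        alibi_slopes,
    )
    return out

Callers that previously passed k_scale and v_scale positionally before alibi_slopes need to move them after it, or switch to keyword arguments.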