@@ -368,16 +368,20 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-        if attention_mask is not None:  # no matter the length, we just slice it
-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-            attn_weights = attn_weights + causal_mask
+        # if attention_mask is not None:  # no matter the length, we just slice it
+        #     causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        #     attn_weights = attn_weights + causal_mask
 
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
-        attn_output = torch.matmul(attn_weights, value_states)
+        # # upcast attention to fp32
+        # attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        # attn_output = torch.matmul(attn_weights, value_states)
+
+        # Integrated with PyTorch/XLA Pallas Flash Attention:
+        from torch_xla.experimental.custom_kernel import flash_attention
+        attn_output = flash_attention(query_states, key_states, value_states, partition_spec=('fsdp', None, None, None))
 
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
             raise ValueError(
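
The flash_attention kernel introduced above can also be exercised on its own as a sanity check of the new code path. The snippet below is a minimal, hypothetical sketch, assuming a TPU runtime with torch_xla installed; the tensor names and the [bsz, num_heads, q_len, head_dim] layout mirror the diff, and partition_spec is dropped because it only matters under an SPMD mesh such as the 'fsdp' axis used in the change.

import torch
import torch_xla.core.xla_model as xm
from torch_xla.experimental.custom_kernel import flash_attention

# Hypothetical standalone check; requires a TPU device.
device = xm.xla_device()
bsz, num_heads, q_len, head_dim = 2, 8, 1024, 128

query_states = torch.randn(bsz, num_heads, q_len, head_dim, device=device)
key_states = torch.randn(bsz, num_heads, q_len, head_dim, device=device)
value_states = torch.randn(bsz, num_heads, q_len, head_dim, device=device)

# Same call as in the diff, minus partition_spec (no SPMD mesh in this toy setup).
attn_output = flash_attention(query_states, key_states, value_states)
assert attn_output.size() == (bsz, num_heads, q_len, head_dim)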