@@ -39,10 +39,25 @@ module nf_multihead_attention_layer
3939 real , allocatable :: k_input(:, :)
4040 real , allocatable :: v_input(:, :)
4141 real , allocatable :: o_input(:, :)
42+
43+ ! temporary storages for forward and backward passes
44+ real , allocatable :: normalized_attention(:, :, :)
45+ real , allocatable :: q_or_dq(:, :, :)
46+ real , allocatable :: k_or_dk(:, :, :)
47+ real , allocatable :: v_or_dv(:, :, :)
48+ real , allocatable :: d_output(:, :, :)
49+ real , allocatable :: v_heads(:, :, :)
50+ real , allocatable :: k_heads(:, :, :)
51+ real , allocatable :: q_heads(:, :, :)
52+ real , allocatable :: d_sdpa(:, :)
53+ real , allocatable :: jacobian(:, :)
54+ real , allocatable :: d_normalize(:, :, :)
4255 contains
4356
4457 procedure :: common_backward
4558 procedure :: common_forward
59+ procedure :: sdpa_forward
60+ procedure :: sdpa_backward
4661 procedure :: get_num_params
4762 procedure :: get_params
4863 procedure :: get_gradients
@@ -68,25 +83,38 @@ end function multihead_attention_layer_cons
6883
6984 interface
7085
71- pure module subroutine common_backward(self, input, gradient)
86+ pure module subroutine common_backward(self, input, gradient, attention_mask )
7287 ! ! General backprop for MultiHead Attention mechanism
7388 ! ! Might be used for both Self and Cross Attention
7489 ! ! Self Attention: sum output gradients
7590 ! ! Cross Attention: use them separately
7691 class(multihead_attention_layer), intent (in out ) :: self
7792 real , intent (in ) :: input(:, :)
7893 real , intent (in ) :: gradient(:, :)
94+ real , optional , intent (in ) :: attention_mask(:, :)
7995 end subroutine common_backward
8096
81- pure module subroutine common_forward(self, query, key, value)
97+ pure module subroutine common_forward(self, query, key, value, attention_mask )
8298 ! ! General forward propagation for MultiHead Attention Mechanism
8399 ! ! Might be used for both Self and Cross Attention
84100 ! ! Self Attention: pass the same value thrice
85101 ! ! Cross Attention: pass three values for your query, key and value
86102 class(multihead_attention_layer), intent (in out ) :: self
87103 real , intent (in ) :: query(:, :), key(:, :), value(:, :)
104+ real , optional , intent (in ) :: attention_mask(:, :)
88105 end subroutine common_forward
89106
107+ pure module subroutine sdpa_forward(self, attention_mask)
108+ class(multihead_attention_layer), intent (in out ) :: self
109+ real , intent (in ), optional :: attention_mask(:, :)
110+ end subroutine sdpa_forward
111+
112+ pure module subroutine sdpa_backward(self, gradient, attention_mask)
113+ class(multihead_attention_layer), intent (in out ) :: self
114+ real , intent (in ) :: gradient(:, :)
115+ real , intent (in ), optional :: attention_mask(:, :)
116+ end subroutine sdpa_backward
117+
90118 pure module subroutine init(self, input_shape)
91119 ! ! Initialize the layer data structures.
92120 ! !
@@ -119,7 +147,7 @@ pure module subroutine normalize_attention_matrix(self, attention_mask)
119147 ! ! Output dims: sequence_length, sequence_length, n_heads
120148 class(multihead_attention_layer), intent (in out ) :: self
121149 ! ! (sequence_length, sequence_length)
122- real , optional , intent (in ) :: attention_mask(:, :, : )
150+ real , optional , intent (in ) :: attention_mask(:, :)
123151 ! ! (sequence_length, sequence_length)
124152 end subroutine normalize_attention_matrix
125153
@@ -143,18 +171,18 @@ elemental module function get_num_params(self) result(num_params)
143171 end function get_num_params
144172
145173 module function get_params (self ) result(params)
146- class(multihead_attention_layer), intent (in ), target :: self
174+ class(multihead_attention_layer), intent (in ) :: self
147175 real , allocatable :: params(:)
148176 end function get_params
149177
150178 module function get_gradients (self ) result(gradients)
151- class(multihead_attention_layer), intent (in ), target :: self
179+ class(multihead_attention_layer), intent (in ) :: self
152180 real , allocatable :: gradients(:)
153181 end function get_gradients
154182
155183 module subroutine set_params (self , params )
156184 class(multihead_attention_layer), intent (in out ) :: self
157- real , intent (in ), target :: params(:)
185+ real , intent (in ) :: params(:)
158186 end subroutine set_params
159187
160188 module subroutine init_base (self , input_shape )
0 commit comments