
Commit d1eda63

[Padding-Free Attention] Fix packed FA attention with pos ids only (#42801)
* fix position ids
* style
* fix
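For context: padding-free ("packed") flash attention concatenates several sequences into a single batch row with no padding and no attention mask; when only position_ids are supplied, the per-sequence boundaries are recovered from the points where the positions restart at zero, yielding the cumulative sequence lengths (cu_seqlens) that the varlen flash-attention kernel consumes. The sketch below is illustrative only, under that assumption; the helper name is mine, not the library's.

import torch

def cu_seqlens_from_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    # Illustrative helper (not the library's actual one): each packed
    # sequence restarts its positions at 0, so every 0 marks the start
    # of a new sequence.
    pos = position_ids.flatten()
    starts = torch.nonzero(pos == 0, as_tuple=True)[0]
    # cu_seqlens closes the boundary list with the total token count.
    return torch.cat([starts, torch.tensor([pos.numel()], device=pos.device)]).to(torch.int32)

# Two sequences (3 and 4 tokens) packed into one row, no attention_mask:
seq_a, seq_b = [0, 1, 2], [0, 1, 2, 3]
position_ids = torch.tensor([seq_a + seq_b])       # tensor([[0, 1, 2, 0, 1, 2, 3]])
print(cu_seqlens_from_position_ids(position_ids))  # tensor([0, 3, 7], dtype=torch.int32)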
1 parent f5aa90d commit d1eda63

23 files changed: +3 −24 lines changed

src/transformers/models/deepseek_v2/modeling_deepseek_v2.py

Lines changed: 0 additions & 1 deletion
@@ -342,7 +342,6 @@ def forward(
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-        position_ids: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         batch_size, seq_length = hidden_states.shape[:-1]
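Why deleting the parameter fixes the padding-free path (my reading of the diff, not confirmed by the commit text): these layers accepted position_ids in their signature but never forwarded it, so a caller-supplied value was bound to the unused parameter instead of travelling through **kwargs to the attention backend that needs it to build cu_seqlens. A minimal sketch of that Python mechanic, with names that are mine, not the library's:

def attention_backend(query, **kwargs):
    # The padding-free FA path would read position_ids from here.
    return kwargs.get("position_ids")

def forward_before(hidden_states, position_ids=None, **kwargs):
    # position_ids is captured by the signature and not forwarded:
    return attention_backend(hidden_states, **kwargs)

def forward_after(hidden_states, **kwargs):
    # position_ids stays inside **kwargs and flows through untouched:
    return attention_backend(hidden_states, **kwargs)

print(forward_before("h", position_ids=[0, 1, 0, 1, 2]))  # None -> boundaries lost
print(forward_after("h", position_ids=[0, 1, 0, 1, 2]))   # [0, 1, 0, 1, 2]

The same one-line deletion repeats in each file below.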

src/transformers/models/deepseek_v2/modular_deepseek_v2.py

Lines changed: 0 additions & 1 deletion
@@ -369,7 +369,6 @@ def forward(
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-        position_ids: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         batch_size, seq_length = hidden_states.shape[:-1]

src/transformers/models/dia/modeling_dia.py

Lines changed: 0 additions & 1 deletion
@@ -525,7 +525,6 @@ def forward(
         encoder_attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[EncoderDecoderCache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         self_attn_cache = past_key_values

src/transformers/models/dia/modular_dia.py

Lines changed: 0 additions & 1 deletion
@@ -314,7 +314,6 @@ def forward(
         encoder_attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[EncoderDecoderCache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         self_attn_cache = past_key_values

src/transformers/models/doge/modeling_doge.py

Lines changed: 0 additions & 1 deletion
@@ -297,7 +297,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]

src/transformers/models/doge/modular_doge.py

Lines changed: 0 additions & 1 deletion
@@ -321,7 +321,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]

src/transformers/models/exaone4/modeling_exaone4.py

Lines changed: 0 additions & 1 deletion
@@ -240,7 +240,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]

src/transformers/models/exaone4/modular_exaone4.py

Lines changed: 0 additions & 1 deletion
@@ -260,7 +260,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]

src/transformers/models/flex_olmo/modeling_flex_olmo.py

Lines changed: 0 additions & 1 deletion
@@ -252,7 +252,6 @@ def forward(
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]

src/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 0 additions & 2 deletions
@@ -344,7 +344,6 @@ def forward(
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, torch.Tensor]:
         input_shape = hidden_states.shape[:-1]
@@ -374,7 +373,6 @@ def forward(
             dropout=0.0 if not self.training else self.attention_dropout,
             scaling=self.scaling,
             sliding_window=self.sliding_window,
-            position_ids=position_ids,
             s_aux=self.sinks,  # diff with Llama
             **kwargs,
         )
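gpt_oss is the one file that also drops an explicit position_ids=position_ids forward into the attention call: after the change the value travels inside **kwargs like everywhere else, and only the genuinely model-specific argument (s_aux, the attention sinks) stays explicit. A toy sketch of the resulting call shape, with names that are mine, not the library's:

def attention_backend(query, **kwargs):
    # Downstream code reads both the generic and model-specific kwargs here.
    return kwargs.get("position_ids"), kwargs.get("s_aux")

def forward(hidden_states, **kwargs):
    # Only s_aux is passed explicitly; position_ids rides along in **kwargs.
    return attention_backend(hidden_states, s_aux="sinks", **kwargs)

print(forward("h", position_ids=[0, 1, 2, 0, 1]))  # ([0, 1, 2, 0, 1], 'sinks')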
