diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index f408b646d35b..b8f548e44e23 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -134,6 +134,7 @@ def forward(
         hidden_states,
         attention_mask=None,
         layer_past=None,
+        position_ids: torch.LongTensor | None = None,
         use_cache=False,
         output_attentions=False,
         **kwargs,
@@ -178,6 +179,7 @@ def forward(
         hidden_states,
         attention_mask=None,
         layer_past=None,
+        position_ids: torch.LongTensor | None = None,
         use_cache=False,
         output_attentions=False,
         **kwargs,
@@ -238,6 +240,7 @@ def forward(
             value,
             attention_mask,
             query_length,
+            position_ids=position_ids,
             dropout=attn_dropout,
             softmax_scale=1.0,
             is_causal=self.is_causal,
@@ -279,6 +282,7 @@ def forward(
         hidden_states,
         layer_past=None,
         attention_mask=None,
+        position_ids: torch.LongTensor | None = None,
         use_cache=False,
         output_attentions=False,
         **kwargs,
@@ -287,6 +291,7 @@ def forward(
             hidden_states,
             attention_mask=attention_mask,
             layer_past=layer_past,
+            position_ids=position_ids,
             use_cache=use_cache,
             output_attentions=output_attentions,
         )
@@ -324,6 +329,7 @@ def forward(
         hidden_states,
         layer_past=None,
         attention_mask=None,
+        position_ids: torch.LongTensor | None = None,
         use_cache=False,
         output_attentions=False,
         **kwargs,
@@ -334,6 +340,7 @@ def forward(
             hidden_states,
             layer_past=layer_past,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             use_cache=use_cache,
             output_attentions=output_attentions,
         )
@@ -481,6 +488,7 @@ def forward(
                 hidden_states,
                 layer_past=past_key_values,
                 attention_mask=causal_mask,
+                position_ids=position_ids,
                 use_cache=use_cache,
                 output_attentions=output_attentions,
             )