GPT Neo #10848
Merged

Changes from 1 commit

Commits (46):
f015a91 lets begin (patil-suraj)
36f9c94 boom boom (patil-suraj)
8255753 fix out proj in attn (patil-suraj)
eb6c00f fix attention (patil-suraj)
0c135f9 fix local attention (patil-suraj)
36c827d add tokenizer (patil-suraj)
bcee6c7 fix imports (patil-suraj)
39954ff autotokenizer (patil-suraj)
b302780 fix checkpoint name (patil-suraj)
efa6003 cleanup (patil-suraj)
d970255 more clean-up (patil-suraj)
dac3f89 more cleanup (patil-suraj)
30cf9ca output attentions (patil-suraj)
ca4bad5 fix attn mask creation (patil-suraj)
5685de9 fix imports (patil-suraj)
784d1cd config doc (patil-suraj)
a474df5 add tests (patil-suraj)
7c90f3b add slow tests (patil-suraj)
a5d1161 quality (patil-suraj)
647aec4 add conversion script (patil-suraj)
4fc464a copyright (patil-suraj)
8781740 typo (patil-suraj)
eecbeea another bites the dust (patil-suraj)
f5ca1b9 fix attention tests (patil-suraj)
22c9441 doc (patil-suraj)
2683d8f add embed init in convert function (patil-suraj)
6b9aef4 fix copies (patil-suraj)
8be570a remove tokenizer (patil-suraj)
0a44cbb enable caching (patil-suraj)
bae1b69 address review comments (patil-suraj)
c859513 improve config and create attn layer list internally (patil-suraj)
7336c6f more consistent naming (patil-suraj)
0d8d2bc init hf config from mesh-tf config json file (patil-suraj)
1eb0bfe remove neo tokenizer from doc (patil-suraj)
23849f7 handle attention_mask in local attn layer (patil-suraj)
c46278f attn_layers => attention_layers (patil-suraj)
a59f111 add tokenizer_class in config (patil-suraj)
cbb81f9 fix docstring (patil-suraj)
08988ab raise if len of attention_layers is not same as num_layers (patil-suraj)
6869ee7 remove tokenizer_class from config (patil-suraj)
29663ab more consistent naming (patil-suraj)
e80fc91 fix doc (patil-suraj)
7bb186b fix checkpoint names (patil-suraj)
22150cc fp16 compat (patil-suraj)
83c07a0 Merge branch 'master' into gpt-neo (patil-suraj)
33c9ada Apply suggestions from code review (LysandreJik)
fix out proj in attn
commit 8255753cebde7e0955ad9ebfd475fc9cccdedc52
@@ -35,7 +35,8 @@
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
     BaseModelOutputWithPastAndCrossAttentions,
-    CausalLMOutputWithCrossAttentions, CausalLMOutputWithPast,
+    CausalLMOutputWithCrossAttentions,
+    CausalLMOutputWithPast,
     MaskedLMOutput,
     MultipleChoiceModelOutput,
     QuestionAnsweringModelOutput,

@@ -154,9 +155,7 @@ def __init__(self, nx, n_ctx, config, scale=False):
         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-
-        self.attn_bias = nn.Parameter(torch.zeros(self.embed_dim))
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

     def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
         w = torch.matmul(q, k)

@@ -230,17 +229,14 @@ def forward(
         a = self.merge_heads(a)
         a = self.out_proj(a)
-        # a = self.resid_dropout(a)
-        a += self.attn_bias
         a = self.resid_dropout(a)

         return (a, present) + attn_outputs[1:]  # a, present, (attentions)


 class LocalAttention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
         super().__init__()
-        print("init local")
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0

@@ -268,9 +264,7 @@ def __init__(self, nx, n_ctx, config, scale=False):
         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-
-        self.attn_bias = nn.Parameter(torch.zeros(self.embed_dim))
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

         self.window_size = config.window_size

@@ -350,8 +344,7 @@ def forward(
         attn = attn.reshape(-1, seq_len, self.embed_dim)

         attn = self.out_proj(attn)
-        attn += self.attn_bias
         attn = self.resid_dropout(attn)

         return (attn,)
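The substance of this commit: instead of building `out_proj` without a bias and adding a separately stored `attn_bias` parameter after the projection, `out_proj` now owns its bias (`bias=True`), both in `Attention` and in `LocalAttention`; the stray debug `print` in `LocalAttention.__init__` is also dropped. A minimal sketch of why the two formulations are numerically equivalent once the parameters are copied over (hypothetical sizes, PyTorch assumed, not code from the PR):

```python
import torch
import torch.nn as nn

embed_dim = 8  # hypothetical size for illustration

# Old formulation: bias-free projection plus a separately stored bias parameter.
out_proj_old = nn.Linear(embed_dim, embed_dim, bias=False)
attn_bias = nn.Parameter(torch.zeros(embed_dim))

# New formulation: the projection owns its bias.
out_proj_new = nn.Linear(embed_dim, embed_dim, bias=True)

# Copy the weights and bias so both paths are directly comparable.
with torch.no_grad():
    out_proj_new.weight.copy_(out_proj_old.weight)
    out_proj_new.bias.copy_(attn_bias)

x = torch.randn(2, 4, embed_dim)
old_out = out_proj_old(x) + attn_bias  # project, then add the standalone bias
new_out = out_proj_new(x)              # bias applied inside the Linear
assert torch.allclose(old_out, new_out, atol=1e-6)
```

Folding the bias into the Linear presumably also simplifies the later Mesh-TF to HF conversion, since the checkpoint's output-projection bias can be loaded straight into `out_proj.bias` rather than into a separate parameter.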
Review comment: Same here regarding the GPTNeo prefix.
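The comment presumably refers to the transformers convention of prefixing model-specific classes with the model name; a hypothetical sketch of what that rename would look like for the classes touched in this diff (illustrative names only, not necessarily the ones merged in the PR):

```python
import torch.nn as nn

# Hypothetical renames illustrating the requested GPTNeo prefix.
class GPTNeoAttention(nn.Module):       # was: Attention
    ...

class GPTNeoLocalAttention(nn.Module):  # was: LocalAttention
    ...
```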