
Commit 1510acb

Update for bugging myself to focus =)
- Separated Google's and OpenAI's versions of the Transformer into their own files for easier code reading, since they share very few helper classes: Transformer_OpenAI.py is created. Obviously, the model is not ready yet.
- I am using both the paper and the Hugging Face implementation as my resources, but I probably won't copy/paste the Hugging Face code, since I want this part of the project to follow the same architectural flow as the other implemented models.
- Added the gelu activation function to utils.
1 parent 988c426 commit 1510acb

File tree

4 files changed: +103 -15 lines

config/config.json

Lines changed: 2 additions & 2 deletions
@@ -118,8 +118,8 @@
     }
   },
   "training_properties": {
-    "learner": "lstmcrf",
-    "task": "ner",
+    "learner": "transformer_google",
+    "task": "classification",
     "optimizer": "SGD",
     "learning_rate": 0.05,
     "final_learning_rate": 0.1,

models/Transformer.py

Lines changed: 3 additions & 13 deletions
@@ -24,18 +24,6 @@ def forward(self, x):
         return self.a_2 * (x - mean) / (std + self.epsilon) + self.b_2
 
 
-class LayerNormOpenAI(nn.Module):
-    def __init__(self, features, epsilon=1e-5):
-        self.a_2 = nn.Parameter(torch.ones(features))
-        self.b_2 = nn.Parameter(torch.zeros(features))
-        self.epsilon = epsilon
-
-    def forward(self, x):
-        mean = x.mean(-1, keepdim=True)
-        std = x.std(-1, keepdim=True)
-        return self.a_2 * (x - mean) / torch.sqrt(std + self.epsilon) + self.b_2
-
-
 class EncoderBlockGoogle(nn.Module):
     def __init__(self, layer, num_layers):
         super(EncoderBlockGoogle, self).__init__()
@@ -206,7 +194,7 @@ def forward(self, input):
         return self.dropout(input + Variable(self.pe[:, :input.size(1)], requires_grad=False))
 
 
-class TransformerGoogle():
+class TransformerGoogle:
     def __init__(self, args):
         super(TransformerGoogle, self).__init__()
 
@@ -262,6 +250,8 @@ def create_classifier_transformer(self):
         c = copy.deepcopy
 
         # Initialize individual parts of the full model
+        # attention = torch.nn.MultiheadAttention(num_heads=self.heads, embed_dim=self.embed_dim,
+        #                                         dropout=self.keep_prob_attn)
         attention = MultiHeadedAttentionGoogle(heads=self.heads, d_model=self.embed_dim, keep_prob=self.keep_prob_attn)
 
         ff = PositionalFeedForwardGoogle(d_model=self.embed_dim, d_ff=self.num_hidden_pos_ff,
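
The commented-out lines added in the last hunk point at PyTorch's built-in attention module as a possible drop-in for MultiHeadedAttentionGoogle. A minimal sketch of how that built-in is called, with hypothetical sizes; note that by default it expects inputs shaped (seq_len, batch, embed_dim):

import torch
import torch.nn as nn

# Hypothetical sizes, for illustration only.
embed_dim, heads, seq_len, batch_size = 512, 8, 20, 4

attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=0.1)

x = torch.randn(seq_len, batch_size, embed_dim)  # (seq_len, batch, embed_dim)
out, attn_weights = attention(x, x, x)           # self-attention: query = key = value
print(out.shape)                                 # torch.Size([20, 4, 512])

One thing worth double-checking before switching over: nn.MultiheadAttention's dropout argument is a drop probability, so it should be fed the same kind of value the project already passes to nn.Dropout elsewhere, despite the keep_prob naming.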

models/Transformer_OpenAI.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+import copy
+import math
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from utils.utils import clones
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, features, epsilon=1e-5):
+        self.a_2 = nn.Parameter(torch.ones(features))
+        self.b_2 = nn.Parameter(torch.zeros(features))
+        self.epsilon = epsilon
+
+    def forward(self, x):
+        mean = x.mean(-1, keepdim=True)
+        std = x.std(-1, keepdim=True)
+        return self.a_2 * (x - mean) / torch.sqrt(std + self.epsilon) + self.b_2
+
+
+class Embeddings(nn.Module):
+    def __init__(self, embed_dim, vocab_size, keep_prob, padding_id, use_pretrained_embed, pretrained_weights):
+        super(Embeddings, self).__init__()
+        # Initialize embeddings
+        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_id).cpu()
+        if use_pretrained_embed:
+            self.load_pretrained_weights()
+        self.embed_drop = nn.Dropout(keep_prob)
+
+    def forward(self, input):
+        x = self.embed_drop(self.embedding(input))
+        out = x.sum(dim=2)
+        return out
+
+
+class LanguageModelHead(nn.Module):
+    def __init__(self):
+        super(LanguageModelHead, self).__init__()
+
+
+class TransformerOpenAI:
+    def __init__(self, args):
+        super(TransformerOpenAI, self).__init__()
+
+        self.args_common = args["common_model_properties"]
+        self.args_specific = args["transformer_openai"]
+
+        # Device
+        self.device = self.args_common["device"]
+
+        # Input/Output dimensions
+        self.vocab_size = self.args_common["vocab_size"]
+        self.embed_dim = self.args_common["embed_dim"]
+        self.num_class = self.args_common["num_class"]
+
+        # Embedding parameters
+        self.padding_id = self.args_common["padding_id"]
+
+        # Condition parameters
+        self.use_pretrained_embed = self.args_common["use_pretrained_embed"]
+
+        # Model/Context size
+        self.d_model = self.args_specific["d_model"]
+
+        # Dropout probabilities for each individual part of the full model.
+        self.keep_prob_embed = self.args_specific["keep_prob_embed"]
+
+        # Number of parallel attention layers for MultiHeadedAttention
+        self.heads = self.args_specific["heads"]
+
+        # Number of layers in terms of Blocks
+        self.num_layers = self.args_specific["num_layers"]
+
+        if self.transformer_type == "classifier":
+            self.model = self.create_classifier_transformer()
+        else:
+            raise ValueError("Transformer can be created as classifier for now!")
+
+    def create_classifier_transformer(self):
+        c = copy.deepcopy
+
+        embedding = Embeddings(self.embed_dim, self.vocab_size, self.keep_prob_embed, self.padding_id,
+                               self.use_pretrained_embed, self.pretrained_weights)
+
+
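
Two things stand out in the new file, consistent with the "model is not ready" note in the commit message: LayerNorm assigns nn.Parameter attributes without calling super().__init__() first (PyTorch raises an AttributeError when a parameter is assigned before Module.__init__() has run), and TransformerOpenAI reads self.transformer_type and self.pretrained_weights without ever setting them. A minimal sketch of the LayerNorm fix follows; it also normalizes by the square root of the variance plus eps, the usual layer-norm formulation, rather than taking the square root of the standard deviation plus eps as in the diff above.

import torch
import torch.nn as nn


class LayerNorm(nn.Module):
    def __init__(self, features, epsilon=1e-5):
        # nn.Parameter attributes can only be registered after Module.__init__() runs.
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.epsilon = epsilon

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        # Normalize by sqrt(variance + eps); the version above applies sqrt to (std + eps).
        var = x.var(-1, unbiased=False, keepdim=True)
        return self.a_2 * (x - mean) / torch.sqrt(var + self.epsilon) + self.b_2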

utils/utils.py

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 
 def save_vocabulary(vocab, path):
@@ -75,3 +76,10 @@ def subsequent_mask(size):
     # The Annotated Transformer = https://nlp.seas.hardvard.edu/2018/04/03.attention.html
     sm = np.triu(np.ones((1, size, size)), k=1).astype("uint8")
     return torch.from_numpy(sm) == 0
+
+
+def gelu(x):
+    # Gaussian Error Linear Unit
+    # Ref: https://github.com/pytorch/pytorch/issues/20464
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(math.pi / 2) * (x + 0.044715 * x ** 3)))
+
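
A note on the new activation: the tanh approximation of GELU from Hendrycks & Gimpel (2016) uses a factor of sqrt(2 / pi) inside the tanh, whereas the version added above uses sqrt(pi / 2); the function also relies on math, which needs to be imported in utils.py. A sketch with the commonly cited constant:

import math

import torch


def gelu(x):
    # Tanh approximation of the Gaussian Error Linear Unit (Hendrycks & Gimpel, 2016).
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

Depending on the installed PyTorch version, torch.nn.functional.gelu may already be available, which would make a local helper unnecessary.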
