Thalles Silva (sthalles)

  • Campinas, São Paulo
import math

import torch
from torch import nn

class TransformerClassifier(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(**config['encoder'])
        self.linear = nn.Linear(config['encoder']['hidden_size'],
                                config['classifier']['num_classes'])

    def forward(self, x):
        out = self.encoder(x)
        # Classify from the hidden state of the first ([CLS]) token.
        return self.linear(out[:, 0, :])
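
A minimal usage sketch; the configuration values below (vocabulary size, sequence length, number of layers, etc.) are illustrative assumptions, not taken from the gist.

config = {
    'encoder': dict(vocab_size=30522, hidden_size=256, max_position_embeddings=512,
                    num_hidden_layers=4, num_attention_heads=8,
                    intermediate_size=1024, hidden_dropout_prob=0.1),
    'classifier': dict(num_classes=2),
}
model = TransformerClassifier(config)
token_ids = torch.randint(0, config['encoder']['vocab_size'], (1, 25))  # batch of 1, 25 tokens
logits = model(token_ids)  # shape [1, 2]
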
class TransformerEncoder(nn.Module):
    def __init__(self, *, vocab_size, hidden_size, max_position_embeddings, num_hidden_layers,
                 num_attention_heads, intermediate_size, hidden_dropout_prob):
        super(TransformerEncoder, self).__init__()
        self.embeddings = Embeddings(vocab_size, hidden_size, max_position_embeddings)
        self.layers = nn.ModuleList(
            TransformerEncoderLayer(hidden_size, num_attention_heads, intermediate_size, hidden_dropout_prob)
            for _ in range(num_hidden_layers))

    def forward(self, x):
        # Embed the token ids, then run the stack of encoder layers.
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x
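
The Embeddings module referenced above is not included in the snippets. The following is a minimal sketch, assuming learned token plus position embeddings followed by layer normalization; the details are an assumption, not the original implementation.

class Embeddings(nn.Module):
    def __init__(self, vocab_size, hidden_size, max_position_embeddings):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, input_ids):
        # Add a learned position embedding to each token embedding.
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)
        embeddings = self.token_embeddings(input_ids) + self.position_embeddings(position_ids)
        return self.layer_norm(embeddings)
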
class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size, hidden_dropout_prob):
        super(TransformerEncoderLayer, self).__init__()
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        self.attention = MultiHeadAttention(hidden_size, num_attention_heads)
        self.feed_forward = FeedForward(hidden_size, intermediate_size, hidden_dropout_prob)

    def forward(self, x):
        # Pre-layer-norm block: attention and feed-forward, each with a skip connection.
        hidden_state = self.layer_norm_1(x)
        x = x + self.attention(hidden_state)
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x
class FeedForward(nn.Module):
    def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(hidden_size, intermediate_size)
        self.linear2 = nn.Linear(intermediate_size, hidden_size)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, x):
        # Position-wise MLP: expand, apply GELU, project back, then dropout.
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        return self.dropout(x)
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        head_dim = embed_dim // num_heads
        self.attn_heads = nn.ModuleList([AttentionHead(embed_dim, head_dim) for _ in range(num_heads)])
        self.linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        # Run each head independently, concatenate along the feature dim, then mix with a linear layer.
        output = torch.cat([head(x) for head in self.attn_heads], dim=-1)
        return self.linear(output)
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Independent projections for queries, keys and values.
        self.Q = nn.Linear(embed_dim, head_dim, bias=False)
        self.K = nn.Linear(embed_dim, head_dim, bias=False)
        self.V = nn.Linear(embed_dim, head_dim, bias=False)

    def forward(self, x):
        attn_output = scale_dot_product_attention(self.Q(x), self.K(x), self.V(x))
        return attn_output
def scale_dot_product_attention(query, key, value):
    # Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V.
    dim = query.shape[-1]
    scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(dim)
    weights = torch.softmax(scores, dim=-1)
    return torch.bmm(weights, value)
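
A quick shape check of the function; the tensor sizes are illustrative, chosen to match the [1, 25, 256] shape used in the toy example below.

q = k = v = torch.randn(1, 25, 256)         # [batch, seq_len, dim]
out = scale_dot_product_attention(q, k, v)
print(out.shape)                            # torch.Size([1, 25, 256])
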
# Step-by-step attention on a toy input (fake_input is assumed here to be a random [1, 25, 256] tensor):
# compute pairwise token similarities, normalize them, then average the values (the input itself).
fake_input = torch.randn(1, 25, 256)
seq_len = fake_input.shape[1]
scores = torch.zeros(1, seq_len, seq_len)
for i in range(seq_len):
    for j in range(seq_len):
        scores[0, i, j] = torch.dot(fake_input[0, i], fake_input[0, j])  # store in the scores tensor
weights = torch.softmax(scores, dim=-1)     # normalize the similarities to sum up to one
attention = torch.bmm(weights, fake_input)  # out shape [1, 25, 256]
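
As a sanity check (not part of the original gist), the loop above agrees with scale_dot_product_attention once the same 1/sqrt(dim) scaling is applied to the scores:

scaled_weights = torch.softmax(scores / math.sqrt(fake_input.shape[-1]), dim=-1)
scaled_attention = torch.bmm(scaled_weights, fake_input)
vectorized = scale_dot_product_attention(fake_input, fake_input, fake_input)
print(torch.allclose(scaled_attention, vectorized, atol=1e-4))  # expected: True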