@djinn
Created August 10, 2023 04:18
This defines a minimal Transformer skeleton with NumPy as the only dependency
import numpy as np
import math
# Define the Transformer model architecture
class Transformer:
    def __init__(self, input_vocab_size, output_vocab_size, max_seq_length, d_model, num_heads, num_layers):
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.embedding = self._create_embedding()
        self.encoder = self._create_encoder()
        self.decoder = self._create_decoder()
        self.final_layer = self._create_final_layer()
    def _create_embedding(self):
        # Shared token-embedding table, sized to the larger of the two
        # vocabularies so both source and target ids can be looked up
        return np.random.randn(max(self.input_vocab_size, self.output_vocab_size), self.d_model)
    def _create_encoder(self):
        return [self._create_encoder_layer() for _ in range(self.num_layers)]

    def _create_encoder_layer(self):
        return {
            'self_attention': self._create_multi_head_attention(),
            'feed_forward': self._create_feed_forward()
        }

    def _create_decoder(self):
        return [self._create_decoder_layer() for _ in range(self.num_layers)]

    def _create_decoder_layer(self):
        return {
            'self_attention': self._create_multi_head_attention(),
            'encoder_attention': self._create_multi_head_attention(),
            'feed_forward': self._create_feed_forward()
        }

    def _create_multi_head_attention(self):
        return {
            'query_weights': np.random.randn(self.d_model, self.d_model),
            'key_weights': np.random.randn(self.d_model, self.d_model),
            'value_weights': np.random.randn(self.d_model, self.d_model)
        }

    def _create_feed_forward(self):
        return {
            'weights1': np.random.randn(self.d_model, 2048),
            'bias1': np.random.randn(2048),
            'weights2': np.random.randn(2048, self.d_model),
            'bias2': np.random.randn(self.d_model)
        }

    def _create_final_layer(self):
        return {
            'weights': np.random.randn(self.d_model, self.output_vocab_size),
            'bias': np.random.randn(self.output_vocab_size)
        }
    def _dot_product_attention(self, query, key, value):
        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_model)) V
        scores = np.dot(query, key.T) / math.sqrt(self.d_model)
        # NumPy has no np.softmax, so compute a numerically stable softmax by hand
        scores -= scores.max(axis=-1, keepdims=True)
        exp_scores = np.exp(scores)
        attention_weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
        output = np.dot(attention_weights, value)
        return output, attention_weights
    def encode(self, inputs):
        # Embedding lookup for source token ids: (batch, seq_len) -> (batch, seq_len, d_model)
        embeddings = self.embedding[inputs]
        return embeddings

    def decode(self, targets):
        # Embedding lookup for target token ids (same shared table)
        embeddings = self.embedding[targets]
        return embeddings

    def forward(self, source_inputs, target_inputs):
        # Encoding: in this sketch only the embedding lookup is applied; the
        # attention and feed-forward parameters are initialized but not yet used
        encoder_output = self.encode(source_inputs)
        # Decoding
        decoder_output = self.decode(target_inputs)
        return decoder_output
# Initialize the Transformer model
input_vocab_size = 10000
output_vocab_size = 8000
max_seq_length = 50
d_model = 512
num_heads = 8
num_layers = 6
batch_size = 32
transformer = Transformer(input_vocab_size, output_vocab_size, max_seq_length, d_model, num_heads, num_layers)
# Example inputs
source_inputs = np.random.randint(0, input_vocab_size, size=(batch_size, max_seq_length))
target_inputs = np.random.randint(0, output_vocab_size, size=(batch_size, max_seq_length))
# Forward pass
decoder_output = transformer.forward(source_inputs, target_inputs)
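# A quick sanity check of the scaled dot-product attention helper, which the
# simplified forward() above does not call. The query/key/value shapes below
# are illustrative assumptions, not values used elsewhere in the gist.
query = np.random.randn(max_seq_length, d_model)
key = np.random.randn(max_seq_length, d_model)
value = np.random.randn(max_seq_length, d_model)
attn_output, attn_weights = transformer._dot_product_attention(query, key, value)
print("decoder_output shape:", decoder_output.shape)  # (batch_size, max_seq_length, d_model)
print("attention output shape:", attn_output.shape)   # (max_seq_length, d_model)
print("attention rows sum to 1:", bool(np.allclose(attn_weights.sum(axis=-1), 1.0)))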