@arthurdouillard
Created May 30, 2021 15:37
import math

import torch
from torch import nn


class SelfAttention(nn.Module):
    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.Wq = nn.Linear(dim, num_heads * dim, bias=False)
        self.Wk = nn.Linear(dim, num_heads * dim, bias=False)
        self.Wv = nn.Linear(dim, num_heads * dim, bias=False)
        self.Wo = nn.Linear(num_heads * dim, dim)

    def forward(self, x):
        # x has shape (batch size, number of tokens, embedding dimension)
        B, T, D = x.shape
        q = self.Wq(x)  # (batch size, number of tokens, number of heads * embedding dimension)
        k = self.Wk(x)
        v = self.Wv(x)
        # Scaled dot-product attention weights: (batch size, number of tokens, number of tokens)
        a = torch.softmax(torch.bmm(q, k.transpose(1, 2)) / math.sqrt(D), dim=-1)
        z = torch.bmm(a, v)
        return self.Wo(z)  # (batch size, number of tokens, embedding dimension)
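
A minimal usage sketch of the module above; the tensor sizes (batch of 2, 10 tokens, embedding dimension 64) are illustrative assumptions, not part of the original gist:

# Usage sketch: dimensions chosen for illustration only.
attn = SelfAttention(dim=64, num_heads=8)
x = torch.randn(2, 10, 64)   # (batch size, number of tokens, embedding dimension)
out = attn(x)
print(out.shape)             # torch.Size([2, 10, 64])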