@goddoe
Last active September 15, 2019 11:03
SelfAttention
import math

import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention."""

    def __init__(self, input_dim, output_dim, dropout=0.1):
        super(SelfAttention, self).__init__()
        # Linear projections for queries, keys, and values.
        self.q = nn.Linear(input_dim, output_dim)
        self.k = nn.Linear(input_dim, output_dim)
        self.v = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: B x ... x S x D
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        # Scaled dot-product attention weights:
        # (B x ... x S x D) @ (B x ... x D x S) => (B x ... x S x S)
        alpha = self.dropout(torch.softmax(q @ k.transpose(-2, -1) / math.sqrt(k.size(-1)), dim=-1))
        # Weighted sum of values:
        # (B x ... x S x S) @ (B x ... x S x D) => (B x ... x S x D)
        return alpha @ v
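
A minimal usage sketch (not part of the original gist): the batch size, sequence length, and feature dimensions below are illustrative assumptions, chosen only to show the expected tensor shapes.

attn = SelfAttention(input_dim=64, output_dim=64, dropout=0.1)
x = torch.randn(2, 10, 64)   # assumed shape: batch 2, sequence length 10, input_dim 64
out = attn(x)                # leading dims are preserved; last dim equals output_dim
print(out.shape)             # torch.Size([2, 10, 64])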