Simple transformer encoder layer
import torch
import torch.nn as nn


class TransformerLayer(nn.Module):
    """Post-LN transformer encoder layer: self-attention and a feed-forward
    block, each followed by a residual connection and layer normalization."""

    def __init__(self, dim=496, heads=4, ffn_dim=1984):
        super().__init__()
        # Attention is a self-attention module assumed to be defined elsewhere
        self.attention = Attention(dim, heads=heads)
        self.fc = nn.Sequential(
            nn.Linear(dim, ffn_dim),
            nn.ReLU(),  # choose your favorite nonlinearity here
            nn.Linear(ffn_dim, dim),
        )
        # each sublayer gets its own LayerNorm parameters
        self.attention_norm = nn.LayerNorm(dim)
        self.fc_norm = nn.LayerNorm(dim)

    def forward(self, x):
        # x: [batch, seq_len, dim]
        residual = x
        x = self.attention(x)
        x = self.attention_norm(x + residual)

        residual = x
        x = self.fc(x)
        x = self.fc_norm(x + residual)
        return x
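
The gist does not define the Attention module it instantiates. Below is a minimal sketch of one plausible drop-in, assuming it wraps PyTorch's nn.MultiheadAttention with batch-first inputs; the name Attention and this implementation are assumptions, not the author's original module.

class Attention(nn.Module):
    # hypothetical stand-in for the Attention module assumed by the gist
    def __init__(self, dim, heads=4):
        super().__init__()
        # batch_first=True keeps tensors as [batch, seq_len, dim],
        # matching the layout TransformerLayer.forward expects
        self.mha = nn.MultiheadAttention(dim, num_heads=heads, batch_first=True)

    def forward(self, x):
        # self-attention: query, key, and value are all x
        out, _ = self.mha(x, x, x, need_weights=False)
        return out

With that in place, the layer can be exercised directly:

layer = TransformerLayer()
x = torch.randn(2, 10, 496)   # [batch, seq_len, dim]
print(layer(x).shape)         # torch.Size([2, 10, 496])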