@catid
Created March 23, 2024 23:30
GIVT GMM Decoder (Claude 3)
# Collaboration between Claude-3 and GPT-4 to implement https://arxiv.org/pdf/2312.02116.pdf
# This is just the GMM decoder part of the model they propose (which is the new thing).
# This one was mainly generated by Claude-3.
# The AIs provided two implementations of the idea and revised each other's code.
# I tested that the unit tests pass but haven't tried it in a language model yet.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ImprovedGMMParametersPrediction(nn.Module):
    def __init__(self, hidden_dim, output_dim, num_components):
        super().__init__()
        self.num_components = num_components
        # Parameter prediction layers
        self.mu = nn.Linear(hidden_dim, output_dim * num_components)
        self.log_sigma = nn.Linear(hidden_dim, output_dim * num_components)
        self.logits_pi = nn.Linear(hidden_dim, num_components)

    def forward(self, x):
        batch_size, seq_length, _ = x.size()
        # Predict parameters and reshape to (batch, seq, components, output_dim)
        mu = self.mu(x).view(batch_size, seq_length, self.num_components, -1)
        log_sigma = self.log_sigma(x).view(batch_size, seq_length, self.num_components, -1)
        logits_pi = self.logits_pi(x).view(batch_size, seq_length, self.num_components)
        # Softmax for mixing coefficients and exp for standard deviations
        pi = F.softmax(logits_pi, dim=-1)
        sigma = torch.exp(log_sigma)
        return mu, sigma, pi
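
# A hedged addition, not part of the original gist: a minimal sketch of the
# GMM negative log-likelihood one might train against, assuming a diagonal
# covariance per mixture component. The helper name `gmm_nll` and the
# torch.distributions usage are our choices, not the paper's reference code.
def gmm_nll(mu, sigma, pi, target):
    # mu, sigma: (batch, seq, K, D); pi: (batch, seq, K); target: (batch, seq, D)
    components = torch.distributions.Independent(
        torch.distributions.Normal(mu, sigma), 1)
    mixture = torch.distributions.Categorical(probs=pi)
    gmm = torch.distributions.MixtureSameFamily(mixture, components)
    # Mean NLL over batch and sequence positions
    return -gmm.log_prob(target).mean()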

class ImprovedGMMOutput(nn.Module):
    def __init__(self, hidden_dim, output_dim, num_mixtures):
        super().__init__()
        self.num_mixtures = num_mixtures
        # Parameter prediction layers
        self.fc_means = nn.Linear(hidden_dim, output_dim * num_mixtures)
        self.fc_scales = nn.Linear(hidden_dim, output_dim * num_mixtures)
        self.fc_weights = nn.Linear(hidden_dim, num_mixtures)

    def forward(self, x):
        batch_size, seq_length, _ = x.size()
        # Predict means, scales, and weights for the GMM
        means = self.fc_means(x).view(batch_size, seq_length, self.num_mixtures, -1)
        scales = F.softplus(self.fc_scales(x)).view(batch_size, seq_length, self.num_mixtures, -1)
        weights = F.softmax(self.fc_weights(x), dim=-1)
        return means, scales, weights
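
# A hedged addition, not part of the original gist: a minimal sketch of
# sampling a continuous token from the predicted mixture, as one would do
# per step of autoregressive decoding. `sample_from_gmm` is our naming;
# it mirrors gmm_nll above.
def sample_from_gmm(means, scales, weights):
    # means, scales: (batch, seq, K, D); weights: (batch, seq, K)
    components = torch.distributions.Independent(
        torch.distributions.Normal(means, scales), 1)
    mixture = torch.distributions.Categorical(probs=weights)
    gmm = torch.distributions.MixtureSameFamily(mixture, components)
    # One draw per (batch, seq) position, shape (batch, seq, D)
    return gmm.sample()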

# Unit tests
def test_improved_gmm_parameter_prediction():
    batch_size = 2
    seq_length = 3
    hidden_dim = 4
    output_dim = 5
    num_components = 3
    model = ImprovedGMMParametersPrediction(hidden_dim, output_dim, num_components)
    x = torch.randn(batch_size, seq_length, hidden_dim)
    mu, sigma, pi = model(x)
    assert mu.shape == (batch_size, seq_length, num_components, output_dim)
    assert sigma.shape == (batch_size, seq_length, num_components, output_dim)
    assert pi.shape == (batch_size, seq_length, num_components)
    assert torch.allclose(pi.sum(dim=-1), torch.ones(batch_size, seq_length), rtol=1e-5, atol=1e-8)
    assert (sigma > 0).all()

def test_improved_gmm_output():
    batch_size = 2
    seq_length = 3
    hidden_dim = 4
    output_dim = 5
    num_mixtures = 3
    model = ImprovedGMMOutput(hidden_dim, output_dim, num_mixtures)
    x = torch.randn(batch_size, seq_length, hidden_dim)
    means, scales, weights = model(x)
    assert means.shape == (batch_size, seq_length, num_mixtures, output_dim)
    assert scales.shape == (batch_size, seq_length, num_mixtures, output_dim)
    assert weights.shape == (batch_size, seq_length, num_mixtures)
    assert torch.allclose(weights.sum(dim=-1), torch.ones(batch_size, seq_length), rtol=1e-5, atol=1e-8)
    assert (scales > 0).all()
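
# A hedged addition: smoke test for the gmm_nll / sample_from_gmm sketches
# above (our helpers, not part of the original gist).
def test_gmm_helpers():
    batch_size = 2
    seq_length = 3
    hidden_dim = 4
    output_dim = 5
    num_mixtures = 3
    model = ImprovedGMMOutput(hidden_dim, output_dim, num_mixtures)
    x = torch.randn(batch_size, seq_length, hidden_dim)
    means, scales, weights = model(x)
    target = torch.randn(batch_size, seq_length, output_dim)
    nll = gmm_nll(means, scales, weights, target)
    assert nll.ndim == 0 and torch.isfinite(nll)
    sample = sample_from_gmm(means, scales, weights)
    assert sample.shape == (batch_size, seq_length, output_dim)
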
if __name__ == "__main__":
    test_improved_gmm_parameter_prediction()
    test_improved_gmm_output()
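    test_gmm_helpers()  # smoke-tests the hedged helper sketches added above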
    print("All tests passed!")