ecsplendid/hinton_cnn.py

## hinton_cnn.py
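# generated with GPT-4o, probably still buggy
# testing what Hinton spoke about here: https://youtu.be/tP-4njhyGvo?si=9JCVwyiftFayc6mA&t=857
# i.e. 50% label noise on the training set
# CNN, ~3.8×10^6 params (3,789,834 for the 1-64-128-256 conv stack plus the 2304-1024-1024-10 dense head; the ~10^8 first noted here was an overestimate), i.e. still in the overparameterised regime for MNIST; tried adding regularisation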

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import os

# Define transformations for the training and validation sets:
# ToTensor scales pixels to [0, 1]; Normalize((0.5,), (0.5,)) then maps them to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Define hyperparameters
batch_size = 64
validation_split = 0.1
label_noise_fraction = 0.5  # Share of training labels that get corrupted (the 50% label-noise experiment)

# Download and prepare the datasets
mnist_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Split the dataset into training and validation sets
train_size = int((1 - validation_split) * len(mnist_dataset))
val_size = len(mnist_dataset) - train_size
train_dataset, val_dataset = random_split(mnist_dataset, [train_size, val_size])

# Preprocess data. The raw uint8 tensors are read directly, which bypasses
# `transform`, so the same scaling and normalisation are applied by hand here.
# Without this, training would see inputs in [0, 1] while the validation and
# test loaders (which do go through `transform`) deliver inputs in [-1, 1].
train_images = train_dataset.dataset.data[train_dataset.indices].view(-1, 28*28).float() / 255.0
train_images = (train_images - 0.5) / 0.5
train_labels = train_dataset.dataset.targets[train_dataset.indices]

# Shuffle the training set (images and labels are still correctly paired here)
indices = torch.randperm(len(train_labels))
train_images_shuffled = train_images[indices]
train_labels_shuffled = train_labels[indices]

# Inject label noise: pick a random `label_noise_fraction` of the samples and
# permute the labels among them. A permuted label can coincide with the true
# one, so the share of labels that end up wrong is somewhat below the nominal
# fraction. Indexing with a tensor copies, so the validation labels are untouched.
num_noisy = int(label_noise_fraction * len(train_labels_shuffled))
noisy_positions = torch.randperm(len(train_labels_shuffled))[:num_noisy]
train_labels_shuffled[noisy_positions] = train_labels_shuffled[noisy_positions[torch.randperm(num_noisy)]]

# Create a dataset with shuffled images and partially shuffled labels
shuffled_train_dataset = TensorDataset(train_images_shuffled, train_labels_shuffled)

# Data loaders
train_loader = DataLoader(shuffled_train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the CNN model with Dropout
class CNN(nn.Module):
    """Three conv/ReLU/max-pool stages followed by a three-layer dense head.

    Every convolution uses a 3x3 kernel with padding 1, and every stage ends
    in a 2x2 max-pool, so a 1x28x28 input shrinks 28 -> 14 -> 7 -> 3 and the
    head receives 256*3*3 features per sample. Dropout (p=0.3) follows each of
    the two hidden dense layers; the output is 10 unnormalised class scores.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: channel widths 1 -> 64 -> 128 -> 256.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        # Classifier head: 2304 -> 1024 -> 1024 -> 10.
        self.fc1 = nn.Linear(256 * 3 * 3, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 10)
        # One dropout module shared by both hidden dense layers.
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Map a batch of 1x28x28 images to a ``(batch, 10)`` tensor of class scores."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = torch.max_pool2d(torch.relu(conv(x)), 2)
        # Flatten the pooled feature maps to one row per sample.
        x = x.view(-1, 256 * 3 * 3)
        for hidden in (self.fc1, self.fc2):
            x = self.dropout(torch.relu(hidden(x)))
        return self.fc3(x)

# Choose the compute backend: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = CNN().to(device)

# Cross-entropy objective; Adam's weight_decay supplies the L2 regularisation.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

# Epoch budget and the per-epoch metric histories filled in by the training loop.
num_epochs = 2000
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

def calculate_accuracy(loader, model):
    """Return the top-1 accuracy of ``model`` over ``loader`` as a percentage.

    The model is put into eval mode for the measurement, so layers that behave
    differently during training (such as dropout) do not distort the result,
    and its previous train/eval mode is restored before returning.

    Args:
        loader: Iterable of ``(images, labels)`` batches. Each image batch is
            reshaped to ``(-1, 1, 28, 28)`` before the forward pass.
        model: ``nn.Module`` mapping an image batch to per-class scores.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when ``loader``
        yields no samples.
    """
    # Run on whatever device the model lives on; fall back to the module-level
    # `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for images, labels in loader:
                images = images.view(-1, 1, 28, 28).to(target_device)  # Reshape to original dimensions
                labels = labels.to(target_device)
                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    if total == 0:
        return 0.0
    return 100 * correct / total

# Mixed precision here comes from torch.cuda.amp, so it is enabled only when
# the model actually runs on CUDA; on MPS/CPU both helpers become pass-throughs.
use_amp = device.type == "cuda"
# A single scaler for the whole run, so its loss-scale state carries over
# from one epoch to the next instead of being reset every epoch.
scaler = GradScaler(enabled=use_amp)

# Open a log file to write the loss data incrementally
with open('training_log_cnn.txt', 'w') as log_file:
    log_file.write('Epoch,Train Loss,Validation Loss,Train Accuracy,Validation Accuracy\n')

    for epoch in range(num_epochs):
        # ---- Optimisation pass over the training set ----
        model.train()
        train_loss = 0
        for images, labels in train_loader:
            images = images.view(-1, 1, 28, 28).to(device)  # Reshape to original dimensions
            labels = labels.to(device)
            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
        # Mean of the per-batch losses seen while the weights were being updated.
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # ---- Evaluation: switch dropout off before measuring anything ----
        model.eval()
        train_accuracy = calculate_accuracy(train_loader, model)
        train_accuracies.append(train_accuracy)

        val_loss = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.view(-1, 1, 28, 28).to(device)  # Reshape to original dimensions
                labels = labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        val_accuracy = calculate_accuracy(val_loader, model)
        val_accuracies.append(val_accuracy)

        log_file.write(f'{epoch + 1},{train_loss:.4f},{val_loss:.4f},{train_accuracy:.2f},{val_accuracy:.2f}\n')
        log_file.flush()  # Ensure data is written incrementally
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Validation Accuracy: {val_accuracy:.2f}%')

# Draw the loss curves (left y-axis) and accuracy curves (right y-axis)
# against a shared 1-based epoch axis.
epochs = range(1, num_epochs + 1)
fig, ax1 = plt.subplots()

ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
for history, legend_name, colour in (
    (train_losses, 'Train Loss', 'tab:blue'),
    (val_losses, 'Validation Loss', 'tab:orange'),
):
    ax1.plot(epochs, history, label=legend_name, color=colour)
ax1.tick_params(axis='y')

# Second y-axis that shares the epoch axis with the loss plot.
ax2 = ax1.twinx()
ax2.set_ylabel('Accuracy')
for history, legend_name, colour in (
    (train_accuracies, 'Train Accuracy', 'tab:green'),
    (val_accuracies, 'Validation Accuracy', 'tab:red'),
):
    ax2.plot(epochs, history, label=legend_name, color=colour)
ax2.tick_params(axis='y')

fig.tight_layout()
# One figure-level legend collecting the labelled lines of both axes.
fig.legend(loc='upper right', bbox_to_anchor=(1,1), bbox_transform=ax1.transAxes)
plt.title('Loss and Accuracy over Epochs')

# Write the figure to disk before showing it.
plt.savefig('training_plot.jpg', format='jpg', dpi=300)

plt.show()

# Score the trained model once on the held-out test split.
test_accuracy = calculate_accuracy(test_loader, model)

# Append the result beneath the per-epoch rows of the training log and echo it.
with open('training_log_cnn.txt', mode='a') as log_file:
    log_file.write(f'Test Accuracy: {test_accuracy:.2f}%\n')
    print(f'Test Accuracy: {test_accuracy:.2f}%')
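	# generated with GPT-4o, probably still buggy
	# testing what Hinton spoke about here: https://youtu.be/tP-4njhyGvo?si=9JCVwyiftFayc6mA&t=857
	# i.e. 50% label noise on the training set
	# CNN, ~3.8×10^6 params (3,789,834 for the 1-64-128-256 conv stack plus the 2304-1024-1024-10 dense head; the ~10^8 first noted here was an overestimate), i.e. still in the overparameterised regime for MNIST; tried adding regularisation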

	import torch
	import torchvision
	import torchvision.transforms as transforms
	import torch.nn as nn
	import torch.optim as optim
	from torch.cuda.amp import autocast, GradScaler
	import matplotlib.pyplot as plt
	from torch.utils.data import DataLoader, TensorDataset, random_split
	import numpy as np
	import os

	# Define transformations for the training and validation sets:
	# ToTensor scales pixels to [0, 1]; Normalize((0.5,), (0.5,)) then maps them to [-1, 1].
	transform = transforms.Compose([
		transforms.ToTensor(),
		transforms.Normalize((0.5,), (0.5,))
	])

	# Define hyperparameters
	batch_size = 64
	validation_split = 0.1
	label_noise_fraction = 0.5  # Share of training labels that get corrupted (the 50% label-noise experiment)

	# Download and prepare the datasets
	mnist_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
	test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

	# Split the dataset into training and validation sets
	train_size = int((1 - validation_split) * len(mnist_dataset))
	val_size = len(mnist_dataset) - train_size
	train_dataset, val_dataset = random_split(mnist_dataset, [train_size, val_size])

	# Preprocess data. The raw uint8 tensors are read directly, which bypasses
	# `transform`, so the same scaling and normalisation are applied by hand here.
	# Without this, training would see inputs in [0, 1] while the validation and
	# test loaders (which do go through `transform`) deliver inputs in [-1, 1].
	train_images = train_dataset.dataset.data[train_dataset.indices].view(-1, 28*28).float() / 255.0
	train_images = (train_images - 0.5) / 0.5
	train_labels = train_dataset.dataset.targets[train_dataset.indices]

	# Shuffle the training set (images and labels are still correctly paired here)
	indices = torch.randperm(len(train_labels))
	train_images_shuffled = train_images[indices]
	train_labels_shuffled = train_labels[indices]

	# Inject label noise: pick a random `label_noise_fraction` of the samples and
	# permute the labels among them. A permuted label can coincide with the true
	# one, so the share of labels that end up wrong is somewhat below the nominal
	# fraction. Indexing with a tensor copies, so the validation labels are untouched.
	num_noisy = int(label_noise_fraction * len(train_labels_shuffled))
	noisy_positions = torch.randperm(len(train_labels_shuffled))[:num_noisy]
	train_labels_shuffled[noisy_positions] = train_labels_shuffled[noisy_positions[torch.randperm(num_noisy)]]

	# Create a dataset with shuffled images and partially shuffled labels
	shuffled_train_dataset = TensorDataset(train_images_shuffled, train_labels_shuffled)

	# Data loaders
	train_loader = DataLoader(shuffled_train_dataset, batch_size=batch_size, shuffle=True)
	val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
	test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

	# Define the CNN model with Dropout
	class CNN(nn.Module):
		"""Three conv/ReLU/max-pool stages followed by a three-layer dense head.

		Every convolution uses a 3x3 kernel with padding 1, and every stage ends
		in a 2x2 max-pool, so a 1x28x28 input shrinks 28 -> 14 -> 7 -> 3 and the
		head receives 256*3*3 features per sample. Dropout (p=0.3) follows each of
		the two hidden dense layers; the output is 10 unnormalised class scores.
		"""

		def __init__(self):
			super().__init__()
			# Feature extractor: channel widths 1 -> 64 -> 128 -> 256.
			self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
			self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
			self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
			# Classifier head. The input width is 256*3*3 = 2304; the mangled
			# literal 25633 did not match what the conv stack produces.
			self.fc1 = nn.Linear(256 * 3 * 3, 1024)
			self.fc2 = nn.Linear(1024, 1024)
			self.fc3 = nn.Linear(1024, 10)
			self.dropout = nn.Dropout(0.3)  # Add dropout with 30% probability

		def forward(self, x):
			"""Map a batch of 1x28x28 images to a ``(batch, 10)`` tensor of class scores."""
			for conv in (self.conv1, self.conv2, self.conv3):
				x = torch.max_pool2d(torch.relu(conv(x)), 2)
			# Flatten the pooled feature maps to one row per sample.
			x = x.view(-1, 256 * 3 * 3)
			for hidden in (self.fc1, self.fc2):
				x = self.dropout(torch.relu(hidden(x)))
			return self.fc3(x)

	# Choose the compute backend: Apple MPS first, then CUDA, otherwise the CPU.
	if torch.backends.mps.is_available():
		device = torch.device("mps")
	elif torch.cuda.is_available():
		device = torch.device("cuda")
	else:
		device = torch.device("cpu")
	model = CNN().to(device)

	# Cross-entropy objective; Adam's weight_decay supplies the L2 regularisation.
	criterion = nn.CrossEntropyLoss()
	optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

	# Epoch budget and the per-epoch metric histories filled in by the training loop.
	num_epochs = 2000
	train_losses, val_losses = [], []
	train_accuracies, val_accuracies = [], []

	def calculate_accuracy(loader, model):
		"""Return the top-1 accuracy of ``model`` over ``loader`` as a percentage.

		The model is put into eval mode for the measurement, so layers that behave
		differently during training (such as dropout) do not distort the result,
		and its previous train/eval mode is restored before returning.

		Args:
			loader: Iterable of ``(images, labels)`` batches. Each image batch is
				reshaped to ``(-1, 1, 28, 28)`` before the forward pass.
			model: ``nn.Module`` mapping an image batch to per-class scores.

		Returns:
			``100 * correct / total`` as a float, or ``0.0`` when ``loader``
			yields no samples.
		"""
		# Run on whatever device the model lives on; fall back to the module-level
		# `device` only for a model without parameters.
		first_param = next(model.parameters(), None)
		target_device = first_param.device if first_param is not None else device

		was_training = model.training
		model.eval()
		correct = 0
		total = 0
		try:
			with torch.no_grad():
				for images, labels in loader:
					images = images.view(-1, 1, 28, 28).to(target_device)  # Reshape to original dimensions
					labels = labels.to(target_device)
					predicted = model(images).argmax(dim=1)
					total += labels.size(0)
					correct += (predicted == labels).sum().item()
		finally:
			# Hand the model back in the mode the caller left it in.
			model.train(was_training)

		if total == 0:
			return 0.0
		return 100 * correct / total

	# Mixed precision here comes from torch.cuda.amp, so it is enabled only when
	# the model actually runs on CUDA; on MPS/CPU both helpers become pass-throughs.
	use_amp = device.type == "cuda"
	# A single scaler for the whole run, so its loss-scale state carries over
	# from one epoch to the next instead of being reset every epoch.
	scaler = GradScaler(enabled=use_amp)

	# Open a log file to write the loss data incrementally
	with open('training_log_cnn.txt', 'w') as log_file:
		log_file.write('Epoch,Train Loss,Validation Loss,Train Accuracy,Validation Accuracy\n')

		for epoch in range(num_epochs):
			# ---- Optimisation pass over the training set ----
			model.train()
			train_loss = 0
			for images, labels in train_loader:
				images = images.view(-1, 1, 28, 28).to(device)  # Reshape to original dimensions
				labels = labels.to(device)
				optimizer.zero_grad()
				with autocast(enabled=use_amp):
					outputs = model(images)
					loss = criterion(outputs, labels)
				scaler.scale(loss).backward()
				scaler.step(optimizer)
				scaler.update()
				train_loss += loss.item()
			# Mean of the per-batch losses seen while the weights were being updated.
			train_loss /= len(train_loader)
			train_losses.append(train_loss)

			# ---- Evaluation: switch dropout off before measuring anything ----
			model.eval()
			train_accuracy = calculate_accuracy(train_loader, model)
			train_accuracies.append(train_accuracy)

			val_loss = 0
			with torch.no_grad():
				for images, labels in val_loader:
					images = images.view(-1, 1, 28, 28).to(device)  # Reshape to original dimensions
					labels = labels.to(device)
					outputs = model(images)
					loss = criterion(outputs, labels)
					val_loss += loss.item()
			val_loss /= len(val_loader)
			val_losses.append(val_loss)

			val_accuracy = calculate_accuracy(val_loader, model)
			val_accuracies.append(val_accuracy)

			log_file.write(f'{epoch + 1},{train_loss:.4f},{val_loss:.4f},{train_accuracy:.2f},{val_accuracy:.2f}\n')
			log_file.flush()  # Ensure data is written incrementally
			print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Validation Accuracy: {val_accuracy:.2f}%')

	# Draw the loss curves (left y-axis) and accuracy curves (right y-axis)
	# against a shared 1-based epoch axis.
	epochs = range(1, num_epochs + 1)
	fig, ax1 = plt.subplots()

	ax1.set_xlabel('Epoch')
	ax1.set_ylabel('Loss')
	for history, legend_name, colour in (
		(train_losses, 'Train Loss', 'tab:blue'),
		(val_losses, 'Validation Loss', 'tab:orange'),
	):
		ax1.plot(epochs, history, label=legend_name, color=colour)
	ax1.tick_params(axis='y')

	# Second y-axis that shares the epoch axis with the loss plot.
	ax2 = ax1.twinx()
	ax2.set_ylabel('Accuracy')
	for history, legend_name, colour in (
		(train_accuracies, 'Train Accuracy', 'tab:green'),
		(val_accuracies, 'Validation Accuracy', 'tab:red'),
	):
		ax2.plot(epochs, history, label=legend_name, color=colour)
	ax2.tick_params(axis='y')

	fig.tight_layout()
	# One figure-level legend collecting the labelled lines of both axes.
	fig.legend(loc='upper right', bbox_to_anchor=(1,1), bbox_transform=ax1.transAxes)
	plt.title('Loss and Accuracy over Epochs')

	# Write the figure to disk before showing it.
	plt.savefig('training_plot.jpg', format='jpg', dpi=300)

	plt.show()

	# Score the trained model once on the held-out test split.
	test_accuracy = calculate_accuracy(test_loader, model)

	# Append the result beneath the per-epoch rows of the training log and echo it.
	# The body of the `with` statement is indented again; the flattened paste had
	# left it at the same level as the `with` line itself.
	with open('training_log_cnn.txt', 'a') as log_file:
		log_file.write(f'Test Accuracy: {test_accuracy:.2f}%\n')
		print(f'Test Accuracy: {test_accuracy:.2f}%')