ma7dev/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Intro

It wasn't obvious from PyTorch's documentation how to use PyTorch Profiler (as of today, 8/12/2021), so I spent some time figuring it out. This gist contains a simple example of how to use it.
Instructions


Install the required packages:

torch>=1.9.0
torchvision>=0.10.0
numpy
matplotlib
tensorboard


Start tensorboard server

tensorboard --logdir=./logs


Run profiling.py

python profiling.py


After the program stops, open tensorboard on the displayed port (usually http://localhost:6006) in your browser. (It might take some time to load the data.)

Structure

main.py

Contains the original source code that trains and evaluates the model (without profiling).
profiling.py

Contains part of the original code that would be profiled (training stage).
Insights


It seems that the profiler adds additional delay (like 10-20%) to the current pipeline and it increases as you increase the number of active iterations to be profiled.
Profiler suggests some useful changes that impact performance (like increasing the batch size when GPU load is low, or increasing the number of workers when the Data Loader takes a lot of time to process the data).
Although PyTorch Profiler gave more insights and suggestions for understanding the general usage of resources based on my model and training structure, it isn't obvious how I can use PyTorch Profiler even further to apply more optimizations.
I wish there were a more direct mapping between the nn.Modules/components and what is being displayed. In addition, it would be cool if there were a graph showing the data flow and model structure with the amount of data transferred and the time taken.

Credits


Example used from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
To understand how to use PyTorch Profiler: https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
PyTorch Profiler's docs: https://pytorch.org/docs/stable/profiler.html


## main.py
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as transforms

# Mini-batch size used by both data loaders, and the number of passes made
# over the training set.
BATCH_SIZE = 32
EPOCHS = 10

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 training split stored under ./data; batches are shuffled.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# CIFAR-10 test split, iterated without shuffling.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Human-readable class names, looked up by integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

def imshow(img):
    """Display an image tensor with matplotlib.

    Args:
        img: Image tensor; the transpose below assumes a channels-first
            (channels, height, width) layout. It is un-normalized with
            ``img / 2 + 0.5`` before being shown, which undoes a
            normalization with mean 0.5 and std 0.5.

    The tensor is detached and copied to the CPU before the NumPy
    conversion, so tensors that live on a GPU or that track gradients
    are accepted as well.
    """
    img = img / 2 + 0.5     # unnormalize
    npimg = img.detach().cpu().numpy()
    # Move the channel axis last, the layout plt.imshow is given here.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# Preview one training batch: show the images as a grid and print their labels.
dataiter = iter(trainloader)
# Use the built-in next(): DataLoader iterators in recent PyTorch releases
# no longer provide a .next() method.
images, labels = next(dataiter)
imshow(torchvision.utils.make_grid(images))
# Iterate over the labels that were actually returned rather than assuming
# the batch holds exactly BATCH_SIZE items.
print(' '.join('%5s' % classes[label] for label in labels))

class Net(nn.Module):
    """Small convolutional classifier.

    Two convolution + max-pool stages are followed by three linear layers.
    The first linear layer takes ``16 * 5 * 5`` features and the last one
    produces 10 outputs.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages; both share the same 2x2 max-pool module.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the raw class scores."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Keep the batch dimension and flatten everything else.
        flat = features.flatten(start_dim=1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


# Build the model on the selected device, together with its loss function
# and an SGD optimizer with momentum.
net = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Training: EPOCHS passes over the training set, one optimizer step per batch.
for epoch in range(EPOCHS):
    for batch in trainloader:
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

print('Finished Training')

# Show one test batch, then print the ground-truth and predicted labels of
# its first four images.
dataiter = iter(testloader)
# Use the built-in next(): DataLoader iterators in recent PyTorch releases
# no longer provide a .next() method.
images, labels = next(dataiter)
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

# Inference only, so there is no need to record the autograd graph.
with torch.no_grad():
    outputs = net(images.to(device))
# Index of the largest score along dim 1 is the predicted class.
_, predicted = torch.max(outputs, 1)
print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
                              for j in range(4)))

# Evaluate on the whole test set in a single pass. Per-class counters are
# collected once and the overall accuracy is derived from them, instead of
# running the network over the test set twice.
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predictions = torch.max(outputs, 1)
        # Convert to plain Python ints once per batch rather than indexing
        # with one tensor element at a time.
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            classname = classes[label]
            if label == prediction:
                correct_pred[classname] += 1
            total_pred[classname] += 1

# Overall accuracy: every sample is counted in exactly one class bucket.
correct = sum(correct_pred.values())
total = sum(total_pred.values())
print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

# Per-class accuracy.
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
                                                         accuracy))

## profiling.py
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import torch
import torchvision
import torchvision.transforms as transforms
import torch.profiler

# Profiler schedule parameters; they are passed to torch.profiler.schedule
# further down and also bound the number of training steps that are run.
WAIT = 1
WARMUP = 1
ACTIVE = 3
REPEAT = 2
# Data-loading configuration being profiled.
BATCH_SIZE = 4
NUM_WORKERS = 2
# Run name derived from the configuration, so that runs with different
# settings end up in different trace directories.
RUNNER = f"{BATCH_SIZE}_{NUM_WORKERS}"

# Write the traces below ./logs so that `tensorboard --logdir=./logs` (the
# command given in the README) actually finds them.
on_trace_ready = torch.profiler.tensorboard_trace_handler(f'./logs/{RUNNER}')

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Convert images to tensors and normalize each of the three channels with
# mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 training split stored under ./data; batches are shuffled.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=NUM_WORKERS)


class Net(nn.Module):
    """Small convolutional classifier used as the profiling workload.

    Two convolution + max-pool stages are followed by three linear layers.
    The first linear layer takes ``16 * 5 * 5`` features and the last one
    produces 10 outputs.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages; both share the same 2x2 max-pool module.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the raw class scores."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Keep the batch dimension and flatten everything else.
        flat = features.flatten(start_dim=1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


# Build the model on the selected device, together with its loss function
# and an SGD optimizer with momentum.
net = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Always profile CPU activity; request CUDA activity only when a GPU is
# present so that CPU-only machines do not ask for an unavailable backend.
activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(torch.profiler.ProfilerActivity.CUDA)

# Number of training steps to run: REPEAT cycles of WAIT + WARMUP + ACTIVE
# steps each. Computed once instead of on every iteration.
total_steps = (WAIT + WARMUP + ACTIVE) * REPEAT

with torch.profiler.profile(
    activities=activities,
    schedule=torch.profiler.schedule(
        wait=WAIT,
        warmup=WARMUP,
        active=ACTIVE,
        repeat=REPEAT),
    on_trace_ready=on_trace_ready,
    record_shapes=True,
    with_stack=True
) as p:
    for i, data in enumerate(trainloader):
        # Stop once the step budget computed above has been used up.
        if i >= total_steps:
            break
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Tell the profiler that one training step has finished so that it
        # can advance its schedule.
        p.step()
	import torch.optim as optim
	import torch.nn.functional as F
	import torch.nn as nn
	import numpy as np
	import matplotlib.pyplot as plt
	import torch
	import torchvision
	import torchvision.transforms as transforms

	# Mini-batch size used by both data loaders, and the number of passes made
	# over the training set.
	BATCH_SIZE = 32
	EPOCHS = 10

	# Use the first CUDA device when one is available, otherwise the CPU.
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print(device)

	# Preprocessing applied to every image: convert to a tensor, then normalize
	# each of the three channels with mean 0.5 and standard deviation 0.5.
	transform = transforms.Compose([
		transforms.ToTensor(),
		transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
	])

	# CIFAR-10 training split stored under ./data; batches are shuffled.
	trainset = torchvision.datasets.CIFAR10(
		root='./data', train=True, download=True, transform=transform)
	trainloader = torch.utils.data.DataLoader(
		trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

	# CIFAR-10 test split, iterated without shuffling.
	testset = torchvision.datasets.CIFAR10(
		root='./data', train=False, download=True, transform=transform)
	testloader = torch.utils.data.DataLoader(
		testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

	# Human-readable class names, looked up by integer label.
	classes = (
		'plane', 'car', 'bird', 'cat', 'deer',
		'dog', 'frog', 'horse', 'ship', 'truck',
	)

	def imshow(img):
		"""Display an image tensor with matplotlib.

		Args:
			img: Image tensor; the transpose below assumes a channels-first
				(channels, height, width) layout. It is un-normalized with
				``img / 2 + 0.5`` before being shown, which undoes a
				normalization with mean 0.5 and std 0.5.

		The tensor is detached and copied to the CPU before the NumPy
		conversion, so tensors that live on a GPU or that track gradients
		are accepted as well.
		"""
		img = img / 2 + 0.5  # unnormalize
		npimg = img.detach().cpu().numpy()
		# Move the channel axis last, the layout plt.imshow is given here.
		plt.imshow(np.transpose(npimg, (1, 2, 0)))
		plt.show()

	# Preview one training batch: show the images as a grid and print their labels.
	dataiter = iter(trainloader)
	# Use the built-in next(): DataLoader iterators in recent PyTorch releases
	# no longer provide a .next() method.
	images, labels = next(dataiter)
	imshow(torchvision.utils.make_grid(images))
	# Iterate over the labels that were actually returned rather than assuming
	# the batch holds exactly BATCH_SIZE items.
	print(' '.join('%5s' % classes[label] for label in labels))

	class Net(nn.Module):
		"""Small convolutional classifier.

		Two convolution + max-pool stages are followed by three linear layers.
		The first linear layer takes ``16 * 5 * 5`` features and the last one
		produces 10 outputs.
		"""

		def __init__(self):
			super().__init__()
			# Convolutional stages; both share the same 2x2 max-pool module.
			self.conv1 = nn.Conv2d(3, 6, 5)
			self.pool = nn.MaxPool2d(2, 2)
			self.conv2 = nn.Conv2d(6, 16, 5)
			# Fully connected head.
			self.fc1 = nn.Linear(16 * 5 * 5, 120)
			self.fc2 = nn.Linear(120, 84)
			self.fc3 = nn.Linear(84, 10)

		def forward(self, x):
			"""Run a batch through the network and return the raw class scores."""
			x = self.pool(F.relu(self.conv1(x)))
			x = self.pool(F.relu(self.conv2(x)))
			x = torch.flatten(x, 1)  # flatten all dimensions except batch
			x = F.relu(self.fc1(x))
			x = F.relu(self.fc2(x))
			x = self.fc3(x)
			return x


	# Build the model on the selected device, together with its loss function
	# and an SGD optimizer with momentum.
	net = Net().to(device)
	criterion = nn.CrossEntropyLoss()
	optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

	# Training: EPOCHS passes over the training set, one optimizer step per batch.
	for epoch in range(EPOCHS):
		for i, data in enumerate(trainloader, 0):
			inputs, labels = data[0].to(device), data[1].to(device)
			# Clear the gradients accumulated by the previous step.
			optimizer.zero_grad()
			outputs = net(inputs)
			loss = criterion(outputs, labels)
			loss.backward()
			optimizer.step()
	print('Finished Training')

	# Show one test batch, then print the ground-truth and predicted labels of
	# its first four images.
	dataiter = iter(testloader)
	# Use the built-in next(): DataLoader iterators in recent PyTorch releases
	# no longer provide a .next() method.
	images, labels = next(dataiter)
	imshow(torchvision.utils.make_grid(images))
	print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

	# Inference only, so there is no need to record the autograd graph.
	with torch.no_grad():
		outputs = net(images.to(device))
	# Index of the largest score along dim 1 is the predicted class.
	_, predicted = torch.max(outputs, 1)
	print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
		for j in range(4)))

	# Evaluate on the whole test set in a single pass. Per-class counters are
	# collected once and the overall accuracy is derived from them, instead of
	# running the network over the test set twice.
	correct_pred = {classname: 0 for classname in classes}
	total_pred = {classname: 0 for classname in classes}

	with torch.no_grad():
		for data in testloader:
			images, labels = data[0].to(device), data[1].to(device)
			# calculate outputs by running images through the network
			outputs = net(images)
			# the class with the highest energy is what we choose as prediction
			_, predictions = torch.max(outputs, 1)
			# Convert to plain Python ints once per batch rather than indexing
			# with one tensor element at a time.
			for label, prediction in zip(labels.tolist(), predictions.tolist()):
				classname = classes[label]
				if label == prediction:
					correct_pred[classname] += 1
				total_pred[classname] += 1

	# Overall accuracy: every sample is counted in exactly one class bucket.
	correct = sum(correct_pred.values())
	total = sum(total_pred.values())
	print('Accuracy of the network on the 10000 test images: %d %%' % (
		100 * correct / total))

	# Per-class accuracy.
	for classname, correct_count in correct_pred.items():
		accuracy = 100 * float(correct_count) / total_pred[classname]
		print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
			accuracy))