Last active
June 29, 2022 00:20
-
-
Save buttercutter/b6f526c56e20f029d68e6f9041c3f5c0 to your computer and use it in GitHub Desktop.
GDAS : Searching for A Robust Neural Architecture in Four GPU Hours
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/D-X-Y/AutoDL-Projects/issues/99 | |
import torch | |
import torch.utils.data | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import torch.optim as optim | |
import torchvision | |
import torchvision.transforms as transforms | |
import tensorflow as tf | |
# import numpy as np | |
# ---------------------------------------------------------------------------
# Feature switches and hyper-parameters shared by the whole script.
# NOTE(review): the listing this was recovered from had lost its indentation;
# the nesting of the two `if` bodies below is reconstructed -- confirm.
# ---------------------------------------------------------------------------
VISUALIZER = 0  # non-zero: create a TensorBoard writer and import torchviz (see below)
DEBUG = 0  # non-zero: enable autograd anomaly detection and the debug prints

logdir = 'runs/gdas_experiment_1'  # used by SummaryWriter and by the TensorFlow debug dump

if VISUALIZER:
    # https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html
    from torch.utils.tensorboard import SummaryWriter
    # from tensorboardX import SummaryWriter

    # default `log_dir` is "runs" - we'll be more specific here
    writer = SummaryWriter(logdir)

    # https://github.com/szagoruyko/pytorchviz
    from torchviz import make_dot

if DEBUG:
    torch.autograd.set_detect_anomaly(True)
    tf.debugging.experimental.enable_dump_debug_info(logdir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1)

USE_CUDA = torch.cuda.is_available()  # every module/tensor is moved with .cuda() when True

# https://arxiv.org/pdf/1806.09055.pdf#page=12
TEST_DATASET_RATIO = 0.5  # 50 percent of the dataset is dedicated for testing purpose
BATCH_SIZE = 4  # also hard-coded into the shape of the zero-initialised state tensors
NUM_OF_IMAGE_CHANNELS = 3  # RGB
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
NUM_OF_IMAGE_CLASSES = 10
SIZE_OF_HIDDEN_LAYERS = 64
NUM_EPOCHS = 1
LEARNING_RATE = 0.025  # SGD learning rate; also reused as `sigma` in train_architecture()
MOMENTUM = 0.9
DECAY_FACTOR = 0.0001  # for keeping Ltrain and Lval within acceptable range
NUM_OF_CELLS = 6
NUM_OF_MIXED_OPS = 4  # number of candidate operations held by each Connection
MIXED_OPS_TENSOR_SHAPE = 4  # shape of the computational kernel used inside each mixed ops
NUM_OF_PREVIOUS_CELLS_OUTPUTS = 2  # last_cell_output , second_last_cell_output
NUM_OF_NODES_IN_EACH_CELL = 5  # including the last node that combines the output from all 4 previous nodes
MAX_NUM_OF_CONNECTIONS_PER_NODE = NUM_OF_NODES_IN_EACH_CELL
NUM_OF_CHANNELS = 16
INTERVAL_BETWEEN_REDUCTION_CELLS = 3  # only referenced from commented-out code in Graph.__init__
PREVIOUS_PREVIOUS = 2  # (n-2)
REDUCTION_STRIDE = 2
NORMAL_STRIDE = 1
TAU_GUMBEL = 0.5  # temperature passed to F.gumbel_softmax in Edge.forward_edge
EDGE_WEIGHTS_NETWORK_IN_SIZE = 5
EDGE_WEIGHTS_NETWORK_OUT_SIZE = 2
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# Images are converted to tensors and then normalised per channel with
# mean 0.5 / std 0.5 before they reach the training functions.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 training split (downloaded into ./data on first use).
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

# The "validation" data is the CIFAR-10 split selected by train=False.
valset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)

valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                        shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# NOTE(review): not referenced anywhere else in the visible source.
TRAIN_BATCH_SIZE = int(len(trainset) * (1 - TEST_DATASET_RATIO))
# https://discordapp.com/channels/687504710118146232/703298739732873296/853270183649083433 | |
# for training for edge weights as well as internal NN function weights | |
class Edge(nn.Module):
    """One candidate operation on a connection, plus its scalar edge weight.

    Subclasses must assign the wrapped operation to ``self.f``; this base
    class only reads it.  ``forward`` dispatches on ``types``:

    * ``"f"``    -- run the wrapped operation (``forward_f``)
    * ``"edge"`` -- sample an index with Gumbel-softmax (``forward_edge``)
    * anything else -- return an all-zero tensor of the fixed image shape
    """

    def __init__(self):
        super(Edge, self).__init__()

        # https://stackoverflow.com/a/51027227/8776167
        # self.linear = nn.Linear(EDGE_WEIGHTS_NETWORK_IN_SIZE, EDGE_WEIGHTS_NETWORK_OUT_SIZE)

        # https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html
        # for edge weights, not for internal NN function weights
        self.weights = nn.Parameter(torch.zeros(1), requires_grad=True)

        # for approximate architecture gradient
        # NOTE: these are plain tensors (not nn.Parameter / registered
        # buffers), so they are not part of parameters() or state_dict() and
        # are not moved by .cuda().  Nothing else in this class reads them.
        self.f_weights = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.f_weights_backup = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_plus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)
        self.weight_minus = torch.zeros(MIXED_OPS_TENSOR_SHAPE, requires_grad=True)

    def __freeze_w(self):
        """Stop gradient tracking for the edge weight."""
        self.weights.requires_grad = False

    def __unfreeze_w(self):
        """Resume gradient tracking for the edge weight."""
        self.weights.requires_grad = True

    def __freeze_f(self):
        """Stop gradient tracking for every parameter of ``self.f``."""
        for param in self.f.parameters():
            param.requires_grad = False

    def __unfreeze_f(self):
        """Resume gradient tracking for every parameter of ``self.f``."""
        for param in self.f.parameters():
            param.requires_grad = True

    # for NN functions internal weights training
    def forward_f(self, x):
        """Apply the wrapped operation to *x* with ``self.f`` trainable and
        the edge weight frozen."""
        self.__unfreeze_f()
        self.__freeze_w()

        # inheritance in python classes and SOLID principles
        # https://en.wikipedia.org/wiki/SOLID
        # https://blog.cleancoder.com/uncle-bob/2020/10/18/Solid-Relevance.html
        return self.f(x)

    # self-defined initial NAS architecture, for supernet architecture edge weight training
    def forward_edge(self, x):
        """Sample a hard Gumbel-softmax over *x* and return the arg-max index
        along dim 0, with ``self.f`` frozen and the edge weight trainable.

        NOTE(review): ``torch.argmax`` yields integer indices, so no gradient
        can flow back through the returned tensor -- confirm this is intended.
        """
        self.__freeze_f()
        self.__unfreeze_w()

        # Refer to GDAS equations (5) and (6)
        # if one_hot is already there, would summation be required given that all other entries are forced to 0 ?
        # It's not required, but you don't know, which index is one hot encoded 1.
        # https://pytorch.org/docs/stable/nn.functional.html#gumbel-softmax
        # See also https://github.com/D-X-Y/AutoDL-Projects/issues/10#issuecomment-916619163
        gumbel = F.gumbel_softmax(x, tau=TAU_GUMBEL, hard=True)
        chosen_edge = torch.argmax(gumbel, dim=0)  # converts one-hot encoding into integer
        return chosen_edge

    def forward(self, x, types):
        """Dispatch to ``forward_f`` / ``forward_edge`` according to *types*.

        The zero placeholder is now built only for an unrecognised *types*
        value; previously it was allocated (and copied to the GPU) on every
        call and then immediately overwritten.
        """
        if types == "f":
            return self.forward_f(x)

        if types == "edge":
            return self.forward_edge(x)

        # Unrecognised mode: keep the original fallback of an all-zero tensor.
        y_hat = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH], requires_grad=False)
        if USE_CUDA:
            y_hat = y_hat.cuda()
        return y_hat
class ConvEdge(Edge):
    """Edge whose operation is a 3x3 convolution with 3 input and 3 output channels."""

    def __init__(self, stride):
        super(ConvEdge, self).__init__()

        conv = nn.Conv2d(3, 3, kernel_size=3, stride=stride, padding=1)

        # Kaiming He weight Initialization
        # https://medium.com/@shoray.goel/kaiming-he-initialization-a8d9ed0b5899
        nn.init.kaiming_uniform_(conv.weight, mode='fan_in', nonlinearity='relu')

        self.f = conv
# class LinearEdge(Edge): | |
# def __init__(self): | |
# super().__init__() | |
# self.f = nn.Linear(84, 10) | |
class MaxPoolEdge(Edge):
    """Edge whose operation is 3x3 max pooling (padding 1, ceil mode)."""

    def __init__(self, stride):
        super(MaxPoolEdge, self).__init__()
        pool = nn.MaxPool2d(3, stride=stride, padding=1, ceil_mode=True)
        self.f = pool
class AvgPoolEdge(Edge):
    """Edge whose operation is 3x3 average pooling (padding 1, ceil mode)."""

    def __init__(self, stride):
        super(AvgPoolEdge, self).__init__()
        pool = nn.AvgPool2d(3, stride=stride, padding=1, ceil_mode=True)
        self.f = pool
class Skip(nn.Module):
    """Identity module: hands its input back unchanged."""

    def forward(self, x):
        passthrough = x
        return passthrough
class SkipEdge(Edge):
    """Edge whose operation is the identity (``Skip``)."""

    def __init__(self):
        super(SkipEdge, self).__init__()
        identity = Skip()
        self.f = identity
# to collect and manage different edges between 2 nodes | |
class Connection(nn.Module):
    """Bundle of the candidate edges (conv, max-pool, avg-pool, skip) that
    link two nodes, together with the running feature/edge maps that
    ``Node.forward`` accumulates into."""

    def __init__(self, stride):
        super(Connection, self).__init__()

        # creates distinct edges and references each of them in a list (self.edges)
        # (a LinearEdge candidate used to live here as well)
        candidates = [ConvEdge(stride), MaxPoolEdge(stride), AvgPoolEdge(stride), SkipEdge()]
        if USE_CUDA:
            candidates = [candidate.cuda() for candidate in candidates]
        self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge = candidates

        # python list will break the computation graph, need to use nn.ModuleList as a differentiable python list
        self.edges = nn.ModuleList([self.conv2d_edge, self.maxpool_edge, self.avgpool_edge, self.skip_edge])
        self.edge_weights = torch.zeros(NUM_OF_MIXED_OPS, requires_grad=True)

        # use linear transformation (weighted summation) to combine results from different edges
        self.reinit()

        # snapshot of every edge's scalar weight, copied without recording autograd history
        with torch.no_grad():
            for idx in range(NUM_OF_MIXED_OPS):
                self.edge_weights[idx] = self.edges[idx].weights

        # https://stackoverflow.com/a/45024500/8776167 extracts the weights learned through NN functions
        # self.f_weights[e] = list(self.edges[e].parameters())

    def reinit(self):
        """Reset both accumulated maps to fresh zero tensors."""
        map_shape = [BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH]
        feature_map = torch.zeros(map_shape, requires_grad=False)
        edge_map = torch.zeros(map_shape, requires_grad=True)

        self.combined_feature_map = feature_map
        self.combined_edge_map = edge_map

        if USE_CUDA:
            self.combined_feature_map = self.combined_feature_map.cuda()
            self.combined_edge_map = self.combined_edge_map.cuda()

    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):
        """Sum the outputs of all candidate edges for *x* and scale the sum
        by ``DECAY_FACTOR``.  Gradients are recorded only when *types* is
        ``"edge"``; every other mode runs the edges under ``torch.no_grad()``.
        """
        total = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)
        if USE_CUDA:
            total = total.cuda()

        for idx in range(NUM_OF_MIXED_OPS):
            candidate = self.edges[idx]
            if types == "edge":
                total.requires_grad_()
                total = total + candidate.forward(x, types)
            else:
                with torch.no_grad():
                    total = total + candidate.forward(x, types)

        return total * DECAY_FACTOR
# to collect and manage multiple different connections between a particular node and its neighbouring nodes | |
class Node(nn.Module):
    """A node of a cell: owns ``MAX_NUM_OF_CONNECTIONS_PER_NODE`` outgoing
    ``Connection`` objects and an ``output`` tensor that ``Cell.forward``
    overwrites.

    NOTE(review): the listing this was recovered from had lost its
    indentation; the extent of the loop body in ``forward`` is reconstructed
    -- confirm against the original file.
    """

    def __init__(self, stride):
        super(Node, self).__init__()

        # two types of output connections
        # Type 1: (multiple edges) output connects to the input of the other intermediate nodes
        # Type 2: (single edge) output connects directly to the final output node

        # Type 1
        self.connections = nn.ModuleList([Connection(stride) for i in range(MAX_NUM_OF_CONNECTIONS_PER_NODE)])

        # Type 2
        # depends on PREVIOUS node's Type 1 output
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)  # for initialization

        if USE_CUDA:
            self.output = self.output.cuda()

    def reinit(self):
        """Reset ``self.output`` to a fresh zero tensor (connections are not touched here)."""
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)

        if USE_CUDA:
            self.output = self.output.cuda()

    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, node_num=0, types=None):
        """Feed *x* through the first ``MAX_NUM_OF_CONNECTIONS_PER_NODE - node_num - 1``
        connections, adding each result into that connection's stored map.

        The stored map is ``combined_feature_map`` when *types* is ``"f"`` and
        ``combined_edge_map`` otherwise.  Returns the last accumulated value
        times ``DECAY_FACTOR``; when the loop runs zero times that is the
        initial zero tensor.
        """
        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        # not all nodes have same number of Type-1 output connection
        for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - node_num - 1):
            y = self.connections[cc].forward(x, types)

            # tensorflow does not like the use of self.variable inside def forward() unlike in Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            if types == "f":
                value = self.connections[cc].combined_feature_map

            else:  # "edge"
                value.requires_grad_()
                value = self.connections[cc].combined_edge_map

            # combines all the feature maps from different mixed ops edges
            value = value + y  # Ltrain(w±, alpha)

            # stores the addition result for next for loop index
            # NOTE(review): the stored map therefore keeps growing across calls
            # until reinit() is invoked on the connection.
            if types == "f":
                self.connections[cc].combined_feature_map = value

            else:  # "edge"
                self.connections[cc].combined_edge_map = value

        decayed_value = value * DECAY_FACTOR

        if USE_CUDA:
            decayed_value = decayed_value.cuda()

        return decayed_value
# to manage all nodes within a cell | |
class Cell(nn.Module):
    """A cell made of ``NUM_OF_NODES_IN_EACH_CELL`` nodes.

    ``forward`` does not return anything; its result is left in
    ``self.output``, which ``Graph.forward`` reads afterwards.

    NOTE(review): the listing this was recovered from had lost its
    indentation.  The nesting inside ``forward`` -- in particular whether the
    ``x1``/``x2`` contribution for ``n > 0`` sits inside or after the
    ``for ni`` loop -- is a reconstruction; confirm against the original file.
    """

    def __init__(self, stride):
        super(Cell, self).__init__()

        # all the coloured edges inside
        # https://user-images.githubusercontent.com/3324659/117573177-20ea9a80-b109-11eb-9418-16e22e684164.png

        # A single cell contains 'NUM_OF_NODES_IN_EACH_CELL' distinct nodes
        # for the k-th node, we have (k+1) preceding nodes.
        # Each intermediate state, 0->3 ('NUM_OF_NODES_IN_EACH_CELL-1'),
        # is connected to each previous intermediate state
        # as well as the output of the previous two cells, c_{k-2} and c_{k-1} (after a preprocessing layer).
        # previous_previous_cell_output = c_{k-2}
        # previous_cell_output = c{k-1}
        self.nodes = nn.ModuleList([Node(stride) for i in range(NUM_OF_NODES_IN_EACH_CELL)])

        # just for variables initialization
        self.previous_cell = 0
        self.previous_previous_cell = 0

        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)

        if USE_CUDA:
            self.output = self.output.cuda()

    def reinit(self):
        """Reset ``self.output`` to a fresh zero tensor (nodes are not touched here)."""
        self.output = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                                  requires_grad=False)

        if USE_CUDA:
            self.output = self.output.cuda()

    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, x1, x2, c=0, types=None):
        """Run every node of the cell and accumulate the node outputs into
        ``self.output``.

        x:      input batch, used by node 0 when ``c <= 1``.
        x1, x2: outputs of two other cells, used when ``c > 1``.
        c:      index of this cell inside the graph.
        types:  ``"f"`` or ``"edge"``; forwarded to the nodes and used to pick
                which stored connection map feeds the later nodes.
        """
        value = torch.zeros([BATCH_SIZE, NUM_OF_IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH],
                            requires_grad=False)

        if types == "edge":
            value.requires_grad_()
            self.output.requires_grad_()

        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            if types == "edge":
                self.nodes[n].output.requires_grad_()

            if c <= 1:
                if n == 0:
                    # Uses datasets as input
                    # x = train_inputs

                    if USE_CUDA:
                        x = x.cuda()

                    # combines all the feature maps from different mixed ops edges
                    self.nodes[n].output = \
                        self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # Uses feature map output from previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n - ni - 1].combined_feature_map

                        else:  # "edge"
                            x = self.nodes[ni].connections[n - ni - 1].combined_edge_map

                        # combines all the feature maps from different mixed ops edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

            else:
                if n == 0:
                    # Uses feature map output from previous neighbour cells for further processing
                    self.nodes[n].output = \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

                else:
                    # Uses feature map output from previous neighbour nodes for further processing
                    for ni in range(n):
                        # nodes[ni] for previous nodes only
                        # connections[n-ni-1] for neighbour nodes only
                        if types == "f":
                            x = self.nodes[ni].connections[n - ni - 1].combined_feature_map

                        else:  # "edge"
                            x = self.nodes[ni].connections[n - ni - 1].combined_edge_map

                        # combines all the feature maps from different mixed ops edges
                        self.nodes[n].output = self.nodes[n].output + \
                            self.nodes[n].forward(x, node_num=n, types=types)  # Ltrain(w±, alpha)

                    # Uses feature map output from previous neighbour cells for further processing
                    self.nodes[n].output = self.nodes[n].output + \
                        self.nodes[n].forward(x1, node_num=n, types=types) + \
                        self.nodes[n].forward(x2, node_num=n, types=types)  # Ltrain(w±, alpha)

            # 'add' then 'concat' feature maps from different nodes
            # needs to take care of tensor dimension mismatch
            # See https://github.com/D-X-Y/AutoDL-Projects/issues/99#issuecomment-869100416
            # self.output = self.output + self.nodes[n].output

            # tensorflow does not like the use of self.variable inside def forward() unlike in Pytorch.
            # Tensorflow prefers the use of a new intermediate variable instead of self.variable
            value = self.output

            if USE_CUDA:
                self.nodes[n].output = self.nodes[n].output.cuda()
                value = value.cuda()

            value = value + self.nodes[n].output
            self.output = value
# to manage all nodes | |
class Graph(nn.Module):
    """The whole supernet: ``NUM_OF_CELLS`` cells followed by one linear
    classifier and a softmax.

    NOTE(review): the listing this was recovered from had lost its
    indentation; the nesting in ``print_debug`` is a reconstruction --
    confirm against the original file.
    """

    def __init__(self):
        super(Graph, self).__init__()

        stride = 1  # just to initialize a variable

        # for i in range(NUM_OF_CELLS):
        #     if i % INTERVAL_BETWEEN_REDUCTION_CELLS == 0:
        #         stride = REDUCTION_STRIDE  # to emulate reduction cell by using normal cell with stride=2
        #     else:
        #         stride = NORMAL_STRIDE  # normal cell

        # every cell is therefore built with stride 1
        self.cells = nn.ModuleList([Cell(stride) for i in range(NUM_OF_CELLS)])

        self.linears = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)
        self.softmax = nn.Softmax(1)

        # NOTE(review): torch.FloatTensor(0) builds an *empty* tensor (size 0),
        # not a scalar zero; it is only a placeholder until train_architecture()
        # overwrites it.
        self.Lval_backup = torch.FloatTensor(0)

        if USE_CUDA:
            self.Lval_backup = self.Lval_backup.cuda()

    def reinit(self):
        """Reset the stored output/maps of every cell, node and used connection."""
        # See https://discuss.pytorch.org/t/tensorboard-issue-with-self-defined-forward-function/140628/20?u=promach
        for c in range(NUM_OF_CELLS):
            self.cells[c].reinit()

            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                self.cells[c].nodes[n].reinit()

                # not all nodes have same number of Type-1 output connection
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    self.cells[c].nodes[n].connections[cc].reinit()

    def print_debug(self):
        """Print ``grad_fn`` information for cells, nodes and connections
        (only when ``DEBUG`` is set) and call ``retain_grad()`` on the printed
        outputs when ``VISUALIZER`` is 0."""
        for c in range(NUM_OF_CELLS):
            for n in range(NUM_OF_NODES_IN_EACH_CELL):
                # not all nodes have same number of Type-1 output connection
                for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                    for e in range(NUM_OF_MIXED_OPS):
                        if DEBUG:
                            print("c = ", c, " , n = ", n, " , cc = ", cc, " , e = ", e)

                            print("graph.cells[", c, "].nodes[", n, "].connections[", cc,
                                  "].combined_feature_map.grad_fn = ",
                                  self.cells[c].nodes[n].connections[cc].combined_feature_map.grad_fn)

                            print("graph.cells[", c, "].output.grad_fn = ",
                                  self.cells[c].output.grad_fn)

                            print("graph.cells[", c, "].nodes[", n, "].output.grad_fn = ",
                                  self.cells[c].nodes[n].output.grad_fn)

                            if VISUALIZER == 0:
                                self.cells[c].nodes[n].output.retain_grad()
                                print("gradwalk(graph.cells[", c, "].nodes[", n, "].output.grad_fn)")
                                # gradwalk(graph.cells[c].nodes[n].output.grad_fn)

            if DEBUG:
                print("graph.cells[", c, "].output.grad_fn = ",
                      self.cells[c].output.grad_fn)

                if VISUALIZER == 0:
                    self.cells[c].output.retain_grad()
                    print("gradwalk(graph.cells[", c, "].output.grad_fn)")
                    # gradwalk(graph.cells[c].output.grad_fn)

    # See https://www.reddit.com/r/pytorch/comments/rtlvtk/tensorboard_issue_with_selfdefined_forward/
    # Tensorboard visualization requires a generic forward() function
    def forward(self, x, types=None):
        """Run all cells in order on *x*, flatten the last cell's output and
        return ``softmax(linear(...))`` -- i.e. class probabilities, not logits.
        """
        # train_inputs = x

        # https://www.reddit.com/r/learnpython/comments/no7btk/how_to_carry_extra_information_across_dag/
        # https://docs.python.org/3/tutorial/datastructures.html

        # generates a supernet consisting of 'NUM_OF_CELLS' cells
        # each cell contains of 'NUM_OF_NODES_IN_EACH_CELL' nodes
        # refer to PNASNet https://arxiv.org/pdf/1712.00559.pdf#page=5 for the cell arrangement
        # https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html

        # encodes the cells and nodes arrangement in the multigraph

        outputs1 = 0  # just for initialization, no special meaning

        for c in range(NUM_OF_CELLS):
            # NOTE(review): for c == 0 and c == 1 these indices are negative and
            # therefore wrap around to the last cells; Cell.forward only reads
            # x1/x2 when c > 1.
            x1 = self.cells[c - 1].output
            x2 = self.cells[c - PREVIOUS_PREVIOUS].output

            # Cell.forward stores its result in self.cells[c].output
            self.cells[c].forward(x, x1, x2, c, types=types)

        output_tensor = self.cells[NUM_OF_CELLS - 1].output
        # flatten everything except the leading (batch) dimension
        output_tensor = output_tensor.view(output_tensor.shape[0], -1)

        if USE_CUDA:
            output_tensor = output_tensor.cuda()

        if DEBUG and VISUALIZER == 0:
            print("gradwalk(output_tensor.grad_fn)")
            # gradwalk(output_tensor.grad_fn)

        if USE_CUDA:
            outputs1 = self.linears(output_tensor).cuda()

        else:
            outputs1 = self.linears(output_tensor)

        outputs1 = self.softmax(outputs1)

        if USE_CUDA:
            outputs1 = outputs1.cuda()

        return outputs1
# Gradients captured by hook_fn_backward(), in the order the hook fired.
total_grad_out = []
total_grad_in = []


def hook_fn_backward(module, grad_input, grad_output):
    """Backward hook: print the module and both gradient tuples, then record
    them in the module-level ``total_grad_in`` / ``total_grad_out`` lists.

    Returns ``None``.
    """
    report = (
        (module,),  # for distinguishing module
        ('grad_output', grad_output),  # printed first to follow back-propagation order
        ('grad_input', grad_input),
    )
    for fields in report:
        print(*fields)

    # Save to global variables
    total_grad_in.append(grad_input)
    total_grad_out.append(grad_output)
# for tracking the gradient back-propagation operation | |
def gradwalk(x, _depth=0):
    """Recursively print the autograd graph reachable from *x*.

    *x* may be a tensor (its ``grad_fn`` is used as the starting node) or an
    autograd node that exposes ``next_functions``.  Each entry of
    ``next_functions`` is printed indented by its depth.  Objects with
    neither attribute (including ``None``) print nothing.

    The previous version looked at ``x.grad`` -- the gradient *value*, which
    has no ``next_functions`` -- so passing a tensor never walked anything.
    """
    # A tensor carries its graph node in `grad_fn`; graph nodes have no such
    # attribute and are used as they are.
    if hasattr(x, 'grad_fn'):
        x = x.grad_fn

    if hasattr(x, 'next_functions'):
        for fn in x.next_functions:
            print(' ' * _depth + str(fn))
            gradwalk(fn[0], _depth + 1)
# Function to Convert to ONNX | |
def Convert_ONNX(model, model_input):
    """Export *model* to the file ``gdas.onnx`` using *model_input* as the
    example input, then print a confirmation."""
    # axis 0 of both the input and the output is exported as variable length
    variable_axes = {
        'modelInput': {0: 'batch_size'},
        'modelOutput': {0: 'batch_size'},
    }

    export_options = dict(
        export_params=True,  # store the trained parameter weights inside the model file
        opset_version=10,  # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['modelInput'],  # the model's input names
        output_names=['modelOutput'],  # the model's output names
        dynamic_axes=variable_axes,
    )

    # Export the model
    torch.onnx.export(model, model_input, "gdas.onnx", **export_options)

    print(" ")
    print('Model has been converted to ONNX')
# https://translate.google.com/translate?sl=auto&tl=en&u=http://khanrc.github.io/nas-4-darts-tutorial.html | |
def train_NN(forward_pass_only):
    """Build a fresh ``Graph`` and run it over the zipped train/val loaders
    in ``"f"`` mode, returning the last computed training loss.

    forward_pass_only == 0: back-propagate and step the optimiser on every
        batch, then save the whole graph object to ``./model.pth``.
    forward_pass_only != 0: return the loss right after the first batch.

    NOTE(review): the listing this was recovered from had lost its
    indentation; the nesting below (notably whether the hook registration is
    guarded by ``DEBUG``, and that the early ``return`` sits inside the batch
    loop) is a reconstruction -- confirm against the original file.
    """
    if DEBUG:
        print("Entering train_NN(), forward_pass_only = ", forward_pass_only)

    graph = Graph()

    if USE_CUDA:
        graph = graph.cuda()

    if DEBUG:
        modules = graph.named_children()
        print("modules = ", modules)

        if VISUALIZER == 0:
            # Tensorboard does not like backward hook
            for name, module in graph.named_modules():
                module.register_full_backward_hook(hook_fn_backward)

    # NOTE(review): Graph.forward already applies nn.Softmax to its output,
    # while nn.CrossEntropyLoss applies log-softmax itself -- confirm that the
    # double normalisation is intended.
    criterion = nn.CrossEntropyLoss()
    # criterion = nn.BCELoss()
    optimizer1 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    # just for initialization, no special meaning
    Ltrain = 0
    NN_input = 0
    NN_output = torch.tensor(0)
    NN_train_labels = 0

    # zip() stops at the shorter loader; val_data is fetched but unused here
    for train_data, val_data in (zip(trainloader, valloader)):
        NN_input, NN_train_labels = train_data
        # val_inputs, val_labels = val_data

        if USE_CUDA:
            NN_input = NN_input.cuda()
            NN_train_labels = NN_train_labels.cuda()

        # normalize inputs
        # NOTE(review): the dataset `transform` already applies ToTensor() and
        # Normalize(); dividing by 255 again looks unintended -- TODO confirm.
        NN_input = NN_input / 255

        if forward_pass_only == 0:
            # zero the parameter gradients
            optimizer1.zero_grad()

        # do train thing for internal NN function weights
        NN_output = graph.forward(NN_input, types="f")

        if VISUALIZER:
            # netron https://docs.microsoft.com/zh-cn/windows/ai/windows-ml/tutorials/pytorch-convert-model
            Convert_ONNX(graph, NN_input)

            # tensorboard
            writer.add_graph(graph, NN_input)
            writer.close()

            # graphviz
            make_dot(NN_output.mean(), params=dict(graph.named_parameters())).render("gdas_torchviz", format="svg")

        if DEBUG:
            print("outputs1.size() = ", NN_output.size())
            print("train_labels.size() = ", NN_train_labels.size())

        Ltrain = criterion(NN_output, NN_train_labels)
        Ltrain = Ltrain.requires_grad_()

        Ltrain.retain_grad()

        if forward_pass_only == 0:
            # backward pass
            if DEBUG:
                Ltrain.register_hook(lambda x: print(x))

            Ltrain.backward(retain_graph=True)

            if DEBUG:
                print("starts to print graph.named_parameters()")

                for name, param in graph.named_parameters():
                    print(name, param.grad)

                print("finished printing graph.named_parameters()")

                print("starts gradwalk()")
                # gradwalk(Ltrain.grad_fn)
                print("finished gradwalk()")

            optimizer1.step()

            # graph.reinit()

        else:
            # graph.reinit()

            # no need to save model parameters for next epoch
            return Ltrain

    # DARTS's approximate architecture gradient. Refer to equation (8)
    # needs to save intermediate trained model for Ltrain
    path = './model.pth'
    torch.save(graph, path)  # pickles the whole module object, not just a state_dict

    if DEBUG:
        print("after multiple for-loops")

    return Ltrain
def train_architecture(forward_pass_only, train_or_val='val'):
    """Build a fresh ``Graph``, run it in ``"edge"`` mode and return an
    architecture loss.

    forward_pass_only == 0: train over all zipped batches, save the graph to
        ``./model.pth`` and return
        ``Lval - LEARNING_RATE * (Ltrain_plus - Ltrain_minus) / (2 * epsilon)``.
    forward_pass_only != 0: return the loss of the first batch only.
    train_or_val: ``'val'`` feeds the validation batch and labels, any other
        value feeds the training batch and labels.

    NOTE(review): the listing this was recovered from had lost its
    indentation; the nesting below is a reconstruction -- confirm against the
    original file.
    """
    if DEBUG:
        print("Entering train_architecture(), forward_pass_only = ", forward_pass_only, " , train_or_val = ",
              train_or_val)

    # NOTE(review): a brand-new Graph is created here; the model saved by
    # train_NN() to ./model.pth is not loaded in the visible code.
    graph = Graph()

    if USE_CUDA:
        graph = graph.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer2 = optim.SGD(graph.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    # just for initialization, no special meaning
    Lval = 0
    train_inputs = 0
    train_labels = 0
    val_inputs = 0
    val_labels = 0

    if forward_pass_only == 0:
        # do train thing for architecture edge weights
        graph.train()

        # zero the parameter gradients
        optimizer2.zero_grad()

    if DEBUG:
        print("before multiple for-loops")

    for train_data, val_data in (zip(trainloader, valloader)):
        train_inputs, train_labels = train_data
        val_inputs, val_labels = val_data

        if USE_CUDA:
            train_inputs = train_inputs.cuda()
            train_labels = train_labels.cuda()
            val_inputs = val_inputs.cuda()
            val_labels = val_labels.cuda()

        # normalize inputs
        # NOTE(review): the dataset `transform` already applies ToTensor() and
        # Normalize(); dividing by 255 again looks unintended -- TODO confirm.
        train_inputs = train_inputs / 255
        val_inputs = val_inputs / 255

        # forward pass
        # the return value is discarded; the last cell's output is read below
        if train_or_val == 'val':
            graph.forward(val_inputs, types="edge")  # Lval(w*, alpha)

        else:
            graph.forward(train_inputs, types="edge")  # Lval(w*, alpha)

        output2_tensor = graph.cells[NUM_OF_CELLS - 1].output
        output2_tensor = output2_tensor.view(output2_tensor.shape[0], -1)
        output2_tensor = output2_tensor * DECAY_FACTOR

        if USE_CUDA:
            output2_tensor = output2_tensor.cuda()

        # NOTE(review): a new, randomly initialised nn.Linear is created for
        # every batch and is not among the parameters given to optimizer2, so
        # it is never trained; graph.linears is not used on this path.
        if USE_CUDA:
            m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES).cuda()

        else:
            m_linear = nn.Linear(NUM_OF_IMAGE_CHANNELS * IMAGE_HEIGHT * IMAGE_WIDTH, NUM_OF_IMAGE_CLASSES)

        outputs2 = m_linear(output2_tensor)

        if USE_CUDA:
            outputs2 = outputs2.cuda()

        if DEBUG:
            print("outputs2.size() = ", outputs2.size())
            print("val_labels.size() = ", val_labels.size())
            print("train_labels.size() = ", train_labels.size())

        if train_or_val == 'val':
            Lval = criterion(outputs2, val_labels)

        else:
            Lval = criterion(outputs2, train_labels)

        Lval = Lval.requires_grad_()

        Lval.retain_grad()

        if forward_pass_only == 0:
            # backward pass
            Lval.backward(retain_graph=True)

            # stores a copy of Lval for later usage
            # (this stores a reference to the same tensor, not a clone)
            graph.Lval_backup = Lval

            if DEBUG:
                for name, param in graph.named_parameters():
                    print(name, param.grad)

            optimizer2.step()

        else:
            # no need to save model parameters for next epoch
            return Lval

    # needs to save intermediate trained model for Lval
    path = './model.pth'
    torch.save(graph, path)

    # Lval is overwritten by function calls to train_architecture() of Ltrain_plus and Ltrain_minus
    Lval = graph.Lval_backup

    # DARTS's approximate architecture gradient. Refer to equation (8) and https://i.imgur.com/81JFaWc.png
    sigma = LEARNING_RATE
    epsilon = 0.01 / torch.norm(Lval)

    # replaces f_weights with weight_plus before NN training
    # NOTE(review): `w = w + ...` only rebinds the loop variable; the
    # parameters of `f` are not modified by this loop (nor by the two loops
    # below), and `EE` is never read.  The recursive calls also build their own
    # Graph, so they would not see a perturbation of this one anyway.
    for c in range(NUM_OF_CELLS):
        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            # not all nodes have same number of Type-1 output connection
            for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                for e in range(NUM_OF_MIXED_OPS):
                    EE = graph.cells[c].nodes[n].connections[cc].edges[e]

                    for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
                        w = w + epsilon * Lval

    # test NN to obtain loss
    Ltrain_plus = train_architecture(forward_pass_only=1, train_or_val='train')

    # replaces f_weights with weight_minus before NN training
    for c in range(NUM_OF_CELLS):
        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            # not all nodes have same number of Type-1 output connection
            for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                for e in range(NUM_OF_MIXED_OPS):
                    EE = graph.cells[c].nodes[n].connections[cc].edges[e]

                    for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
                        w = w - 2 * epsilon * Lval

    # test NN to obtain loss
    Ltrain_minus = train_architecture(forward_pass_only=1, train_or_val='train')

    # Restores original f_weights
    for c in range(NUM_OF_CELLS):
        for n in range(NUM_OF_NODES_IN_EACH_CELL):
            # not all nodes have same number of Type-1 output connection
            for cc in range(MAX_NUM_OF_CONNECTIONS_PER_NODE - n - 1):
                for e in range(NUM_OF_MIXED_OPS):
                    EE = graph.cells[c].nodes[n].connections[cc].edges[e]

                    for w in graph.cells[c].nodes[n].connections[cc].edges[e].f.parameters():
                        w = w + epsilon * Lval

    if DEBUG:
        print("after multiple for-loops")

    # central finite difference of the training loss around the current weights
    L2train_Lval = (Ltrain_plus - Ltrain_minus) / (2 * epsilon)

    return Lval - sigma * L2train_Lval
if __name__ == "__main__":
    # Alternate weight training and architecture training until both losses
    # drop to 0.01 or below.
    run_num, not_converged = 0, 1

    while not_converged:
        print("run_num = ", run_num)

        ltrain = train_NN(forward_pass_only=0)
        print("Finished train_NN()")

        # visualizer does not need more than a single run
        if (VISUALIZER or DEBUG) and run_num > 1:
            break

        # 'train_or_val' to differentiate between using training dataset and validation dataset
        lval = train_architecture(forward_pass_only=0, train_or_val='val')
        print("Finished train_architecture()")

        print("lval = ", lval, " , ltrain = ", ltrain)

        still_too_high = (lval > 0.01) or (ltrain > 0.01)
        not_converged = still_too_high

        run_num += 1

    # do test thing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/phung/PycharmProjects/venv/py39/bin/python /home/phung/PycharmProjects/beginner_tutorial/gdas.py | |
Files already downloaded and verified | |
Files already downloaded and verified | |
run_num = 0 | |
[W python_anomaly_mode.cpp:104] Warning: Error detected in CudnnConvolutionBackward0. Traceback of forward call that caused the error: | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 883, in <module> | |
ltrain = train_NN(forward_pass_only=0) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 625, in train_NN | |
NN_output = graph.forward(NN_input, types="f") | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 507, in forward | |
self.cells[c].forward(x, x1, x2, c, types=types) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 396, in forward | |
self.nodes[n-1].forward(x, types=types) # Ltrain(w±, alpha) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 297, in forward | |
y = self.connections[cc].forward(x, types) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 258, in forward | |
edges_results = edges_results + self.edges[e].forward(x, types) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 145, in forward | |
y_hat = self.forward_f(x) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 121, in forward_f | |
return self.f(x) | |
File "/home/phung/PycharmProjects/venv/py39/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl | |
return forward_call(*input, **kwargs) | |
File "/home/phung/PycharmProjects/venv/py39/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 446, in forward | |
return self._conv_forward(input, self.weight, self.bias) | |
File "/home/phung/PycharmProjects/venv/py39/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 442, in _conv_forward | |
return F.conv2d(input, weight, bias, self.stride, | |
(function _print_stack) | |
Traceback (most recent call last): | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 883, in <module> | |
ltrain = train_NN(forward_pass_only=0) | |
File "/home/phung/PycharmProjects/beginner_tutorial/gdas.py", line 651, in train_NN | |
Ltrain.backward(retain_graph=True) | |
File "/home/phung/PycharmProjects/venv/py39/lib/python3.9/site-packages/torch/_tensor.py", line 307, in backward | |
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) | |
File "/home/phung/PycharmProjects/venv/py39/lib/python3.9/site-packages/torch/autograd/__init__.py", line 154, in backward | |
Variable._execution_engine.run_backward( | |
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [3, 3, 3, 3]] is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck! | |
Process finished with exit code 1 |
@yongen9696 No. Search for the function definitions `def forward_f(self, x):`
and `def forward_edge(self, x):`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Is `forward()` missing from your Graph()?