@rish-16
Last active March 29, 2022 06:16
CS4243 PyTorch Snippets
import torch
import torch.nn as nn
import torch.nn.functional as F
"""
Creating tensors
"""
a = torch.rand(...) # returns a torch.Tensor with values drawn uniformly from [0, 1)
b = torch.LongTensor(10).random_(0, 2) # 10-dim vector of random integers from {0, 1}
"""
Network template
"""
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        pass

    def forward(self, x):
        pass
"""
Batch training loop
"""
for epoch in range(EPOCHS):
    num_batches = 0
    running_loss = 0
    shuffled_indices = torch.randperm(DATASETSIZE)

    for i in range(0, DATASETSIZE, BATCHSIZE):
        optimizer.zero_grad()

        idx = shuffled_indices[i:i + BATCHSIZE]
        minibatch_data = train_data[idx]
        minibatch_labels = train_labels[idx]

        inputs = minibatch_data.view(-1, INPUTSIZE)
        inputs.requires_grad_()

        pred = model(inputs)
        loss = criterion(pred, minibatch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.detach().item()
        num_batches += 1

    epoch_loss = running_loss / num_batches
"""
Testing model
"""
def eval_on_test_set(model, test_data, test_label):
    running_error = 0
    num_batches = 0

    for i in range(0, DATASETSIZE, BATCHSIZE):
        inputs = test_data[i:i + BATCHSIZE].unsqueeze(dim=1)
        minibatch_label = test_label[i:i + BATCHSIZE]

        scores = model(inputs)
        error = utils.get_error(scores, minibatch_label)

        running_error += error.item()
        num_batches += 1

    total_error = running_error / num_batches
    print('error rate on test set =', total_error * 100, 'percent')
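# eval_on_test_set relies on a utils.get_error helper that is not shown in
# this gist. A minimal sketch of what it presumably computes (the fraction
# of wrong predictions, i.e., 1 - accuracy), assuming scores are raw logits:
def get_error(scores, labels):
    predicted_labels = scores.argmax(dim=1)
    num_wrong = (predicted_labels != labels).sum()
    return num_wrong.float() / scores.size(0)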
def get_accuracy(scores, labels):
    # use within the batched training loop to get the batch accuracy
    num_data = scores.size(0)
    predicted_labels = scores.argmax(dim=1)
    indicator = (predicted_labels == labels)
    num_correct = indicator.sum()
    accuracy = 100 * num_correct.float() / num_data
    return accuracy
"""
One-hot encoding
"""
def index_to_onehot(labels, num_classes=10):
    """
    Convert index labels to one-hot labels.

    Inputs:
        labels: integer tensor of length N, e.g., [0, 1, 2, 4, 3]
        num_classes: the number of classes, e.g., 5

    Output:
        onehot_labels: tensor of size [N, num_classes], a matrix that
        contains the one-hot label for each sample, e.g.,
        [
            [1, 0, 0, 0, 0],
            [0, 1, 0, 0, 0],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 1, 0]
        ]
    """
    num_samples = len(labels)
    onehot = torch.zeros(num_samples, num_classes)
    onehot[torch.arange(num_samples), labels] = 1
    return onehot
"""
Soft-label cross-entropy
Use only when the network's final layer does not already apply a softmax
"""
score = net(x)  # raw logits
prob = torch.softmax(score, dim=-1)
loss = -(prob.log() * y).sum(dim=-1).mean()  # y: soft labels of shape [N, num_classes]
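
# For hard (one-hot) targets this loss matches PyTorch's built-in
# cross-entropy. A minimal sanity check, reusing index_to_onehot from above
# (torch.log_softmax replaces prob.log() for numerical stability):
score = torch.randn(5, 10)  # dummy logits: 5 samples, 10 classes
labels = torch.tensor([0, 1, 2, 4, 3])
y = index_to_onehot(labels)  # soft targets that happen to be one-hot

soft_loss = -(torch.log_softmax(score, dim=-1) * y).sum(dim=-1).mean()
hard_loss = F.cross_entropy(score, labels)
assert torch.allclose(soft_loss, hard_loss)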
rish-16 commented Mar 28, 2022

VGG architecture

class VGG(nn.Module):
    def __init__(self):
        super(VGG, self).__init__()

        # block 1:         3 x 32 x 32 --> 64 x 16 x 16        
        self.conv1a = nn.Conv2d(3,   64,  kernel_size=3, padding=1 )
        self.conv1b = nn.Conv2d(64,  64,  kernel_size=3, padding=1 )
        self.pool1  = nn.MaxPool2d(2,2)

        # block 2:         64 x 16 x 16 --> 128 x 8 x 8
        self.conv2a = nn.Conv2d(64, 128, (3,3), padding=1)
        self.conv2b = nn.Conv2d(128, 128, (3,3), padding=1)
        self.pool2  = nn.MaxPool2d(2,2)

        # block 3:         128 x 8 x 8 --> 256 x 4 x 4        
        self.conv3a = nn.Conv2d(128, 256, (3,3), padding=1)
        self.conv3b = nn.Conv2d(256, 256, (3,3), padding=1)
        self.pool3  = nn.MaxPool2d((2,2))
        
        #block 4:          256 x 4 x 4 --> 512 x 2 x 2
        self.conv4a = nn.Conv2d(256, 512, (3,3), padding=1)
        self.pool4  =  nn.MaxPool2d((2,2))

        # linear layers:   512 x 2 x 2 --> 2048 --> 4096 --> 4096 --> 10
        self.linear1 = nn.Linear(2048, 4096)
        self.linear2 = nn.Linear(4096, 4096)
        self.linear3 = nn.Linear(4096, 10)


    def forward(self, x):

        # block 1:         3 x 32 x 32 --> 64 x 16 x 16
        x = self.conv1a(x)
        x = torch.relu(x)
        x = self.conv1b(x)
        x = torch.relu(x)
        x = self.pool1(x)

        # block 2:         64 x 16 x 16 --> 128 x 8 x 8
        x = self.conv2a(x)
        x = torch.relu(x)
        x = self.conv2b(x)
        x = torch.relu(x)
        x = self.pool2(x)

        # block 3:         128 x 8 x 8 --> 256 x 4 x 4
        x = self.conv3a(x)
        x = torch.relu(x)
        x = self.conv3b(x)
        x = torch.relu(x)
        x = self.pool3(x)

        #block 4:          256 x 4 x 4 --> 512 x 2 x 2
        x = self.conv4a(x)
        x = torch.relu(x)
        x = self.pool4(x)

        # linear layers:   512 x 2 x 2 --> 2048 --> 4096 --> 4096 --> 10
        x = x.view(-1, 2048)
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        x = torch.relu(x)
        x = self.linear3(x) 
        
        return x
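
A quick shape check on a dummy CIFAR-10-sized batch (the batch size of 4 is an arbitrary choice):

net = VGG()
x = torch.rand(4, 3, 32, 32)  # dummy CIFAR-10 batch
scores = net(x)
print(scores.shape)  # torch.Size([4, 10])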

rish-16 commented Mar 28, 2022

Non-max Suppression

def nms(dets, thresh):
    '''
    dets is a torch tensor of shape [num_dets, 6]: (x1, y1, z1, x2, y2, z2).
    The detections are already sorted by score in descending order,
    so the row indices can be used directly.
    '''
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    z1 = dets[:, 2]
    x2 = dets[:, 3]
    y2 = dets[:, 4]
    z2 = dets[:, 5]

    volume = (x2 - x1 + 1) * (y2 - y1 + 1) * (z2 - z1 + 1)
    order = torch.arange(dets.size(0))  # the boxes are already in sorted order

    keep = []
    while order.size(0) > 0:
        i = order[0]  # pick the remaining box with the highest score
        keep.append(i)

        # intersection of the picked box with every remaining box
        xx1 = torch.max(x1[i], x1[order[1:]])
        yy1 = torch.max(y1[i], y1[order[1:]])
        zz1 = torch.max(z1[i], z1[order[1:]])
        xx2 = torch.min(x2[i], x2[order[1:]])
        yy2 = torch.min(y2[i], y2[order[1:]])
        zz2 = torch.min(z2[i], z2[order[1:]])

        w = torch.max(torch.as_tensor(0.0), xx2 - xx1 + 1)  # overlap width
        h = torch.max(torch.as_tensor(0.0), yy2 - yy1 + 1)  # overlap height
        l = torch.max(torch.as_tensor(0.0), zz2 - zz1 + 1)  # overlap length
        inter = w * h * l
        ovr = inter.float() / (volume[i] + volume[order[1:]] - inter).float()

        # keep only the remaining boxes whose IoU with the picked box is at or
        # below the threshold; the +1 offset accounts for the order[1:] slice
        inds = torch.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return keep
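
A small sanity check with two heavily overlapping 3D boxes and one distant box (the 0.5 threshold is an arbitrary choice); the second box should be suppressed:

dets = torch.tensor([[0., 0., 0., 10., 10., 10.],      # highest-scoring box
                     [1., 1., 1., 10., 10., 10.],      # overlaps box 0 heavily
                     [20., 20., 20., 30., 30., 30.]])  # far away from box 0
print(nms(dets, 0.5))  # [tensor(0), tensor(2)]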
# Per-class NMS, written as a method of a detector class: it assumes numpy is
# imported as np, that self.bbox_iou and self.nms_threshold exist, and that
# each box carries per-class scores in box.classes.
def non_maximum_suppression(self, boxes):
    if len(boxes) > 0:
        nb_class = len(boxes[0].classes)
    else:
        return

    for c in range(nb_class):
        sorted_indices = np.argsort([-box.classes[c] for box in boxes])

        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]

            if boxes[index_i].classes[c] == 0: continue

            for j in range(i + 1, len(sorted_indices)):
                index_j = sorted_indices[j]

                # zero out the class score of any lower-ranked box that
                # overlaps the current box too heavily
                if self.bbox_iou(boxes[index_i], boxes[index_j]) >= self.nms_threshold:
                    boxes[index_j].classes[c] = 0

    return boxes

rish-16 commented Mar 28, 2022

LeNet architecture

class LeNet5(nn.Module):
    def __init__(self):
        super(LeNet5, self).__init__()

        # CL1:   1 x 28 x 28  -->   50 x 28 x 28 
        self.conv1 = nn.Conv2d(1,   50,  kernel_size=3,  padding=1 )
        
        # MP1: 50 x 28 x 28 -->    50 x 14 x 14
        self.pool1  = nn.MaxPool2d(2,2)
        
        # CL2:   50 x 14 x 14  -->    100 x 14 x 14 
        self.conv2 = nn.Conv2d(50,  100,  kernel_size=3,  padding=1 )
        
        # MP2: 100 x 14 x 14 -->    100 x 7 x 7
        self.pool2 = nn.MaxPool2d(2,2)
        
        # LL1:   100 x 7 x 7 = 4900 -->  100 
        self.linear1 = nn.Linear(4900, 100)
        
        # LL2:   100  -->  10 
        self.linear2 = nn.Linear(100,10)


    def forward(self, x):

        # CL1:   1 x 28 x 28  -->   50 x 28 x 28 
        x = self.conv1(x)
        x = torch.relu(x)
        
        # MP1: 50 x 28 x 28 -->    50 x 14 x 14
        x = self.pool1(x)
        
        # CL2:   50 x 14 x 14  -->    100 x 14 x 14
        x = self.conv2(x)
        x = torch.relu(x)
        
        # MP2: 100 x 14 x 14 -->    100 x 7 x 7
        x = self.pool2(x)

        # LL1:   100 x 7 x 7 = 4900  -->  100 
        x = x.view(-1, 4900)
        x = self.linear1(x)
        x = torch.relu(x)
        
        # LL2:   100  -->  10 
        x = self.linear2(x)
    
        return x
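
A quick shape check on a dummy MNIST-sized batch (batch size 4 is arbitrary):

net = LeNet5()
x = torch.rand(4, 1, 28, 28)  # dummy MNIST batch
scores = net(x)
print(scores.shape)  # torch.Size([4, 10])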

rish-16 commented Mar 28, 2022

Faster R-CNN architecture

class FasterRCNN(nn.Module):
  def __init__(self, dim, offset, obj_size, n_objects, n_object_classes):
    super().__init__()
    # store the hyperparameters needed in forward()
    self.dim = dim
    self.offset = offset
    self.obj_size = obj_size
    self.n_objects = n_objects

    self.conv1 = nn.Conv2d(1, dim, (3,3), padding=1) # 1x28x28 -> dx28x28
    self.conv2 = nn.Conv2d(dim, dim, (3,3), padding=1) # dx28x28 -> dx28x28
    self.classifier_head = nn.Linear(dim * obj_size ** 2, n_object_classes) # dx7x7 -> n

    # built-in region proposal network
    self.conv_anchor = nn.Conv2d(dim, 1, (obj_size, obj_size), padding=offset) # dx28x28 -> 1x28x28

  def forward(self, x, bb, training=True):
    bs, c, h, w = x.shape
    dim, offset, obj_size = self.dim, self.offset, self.obj_size

    x = self.conv1(x) # [bs, dim, 28, 28]
    x = torch.relu(x)
    x = self.conv2(x) # [bs, dim, 28, 28]
    x = torch.relu(x)

    scores_bbox_anchor = self.conv_anchor(x).squeeze(1) # [bs, 28, 28]

    if training:
      # cut out the feature patch under each ground-truth bbox centre
      bbox = []
      for b in range(bs):
        for k in range(self.n_objects):
          cx = bb[b, k, 0].long()
          cy = bb[b, k, 1].long()
          bbox.append(x[b, :, cy - offset:cy - offset + obj_size,
                        cx - offset:cx - offset + obj_size])

      bbox = torch.stack(bbox, dim=0) # [bs * n_objects, dim, obj_size, obj_size]
      bbox = bbox.view(-1, dim * obj_size**2) # [bs * n_objects, dim * obj_size * obj_size]
      scores_bbox_class = self.classifier_head(bbox) # [bs * n_objects, n_object_classes]

    else:
      batch_bbox = []
      for b in range(bs):
        # compute the coordinates of the top-K bbox anchor scores (K = n_objects)
        scores_b = scores_bbox_anchor[b].view(-1) # [h * w]

        _, idx_largest = torch.sort(scores_b, descending=True)
        idx_largest = idx_largest[:self.n_objects]

        idx_y = idx_largest // w # [n_objects]
        idx_x = idx_largest - idx_y * w # [n_objects]

        # extract the top-K feature patches of size [dim, obj_size, obj_size]
        bbox = []
        for k in range(self.n_objects):
          bbox.append(x[b, :, idx_y[k] - offset:idx_y[k] - offset + obj_size,
                        idx_x[k] - offset:idx_x[k] - offset + obj_size])

        bbox = torch.stack(bbox, dim=0) # [n_objects, dim, obj_size, obj_size]
        bbox = bbox.view(-1, dim * obj_size**2) # [n_objects, dim * obj_size * obj_size]
        batch_bbox.append(bbox)

      # class scores for every bbox: [bs * n_objects, n_object_classes]
      batch_bbox = torch.cat(batch_bbox, 0)
      scores_bbox_class = self.classifier_head(batch_bbox)

    return scores_bbox_class, scores_bbox_anchor
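
A minimal smoke test in training mode; all hyperparameter values here are placeholder assumptions (hidden dim 32, 7x7 objects on 28x28 images, 2 objects, 10 classes):

net = FasterRCNN(dim=32, offset=3, obj_size=7, n_objects=2, n_object_classes=10)
x = torch.rand(4, 1, 28, 28) # dummy image batch
bb = torch.randint(3, 25, (4, 2, 2)) # dummy bbox centres, kept away from the border
cls_scores, anchor_scores = net(x, bb, training=True)
print(cls_scores.shape, anchor_scores.shape) # torch.Size([8, 10]) torch.Size([4, 28, 28])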

rish-16 commented Mar 28, 2022

Semantic Segmentation CNN architecture

class SemanticSegmentCNN(nn.Module):
    
    def __init__(self, dim, nb_pixel_classes):
        super(SemanticSegmentCNN, self).__init__()

        # downsampling convnet
        self.conv1 = nn.Conv2d(1, dim, (3,3), padding=1, stride=2) #  1x28x28 --> hidden_dim x14x14
        self.conv2 = nn.Conv2d(dim, dim, (3,3), padding=1, stride=2) # hidden_dim x14x14 --> hidden_dim x7x7 

        # upsampling convnet
        self.trans_conv1 = nn.ConvTranspose2d(dim, dim, (4,4), padding=1, stride=2) #  hidden_dim x7x7 --> hidden_dim x14x14
        self.trans_conv2 = nn.ConvTranspose2d(dim, dim, (4,4), padding=1, stride=2) #  hidden_dim x14x14 --> hidden_dim x28x28

        # classification layer
        self.classifier_head = nn.Conv2d(dim, nb_pixel_classes, (3,3), padding=1, stride=1) #  hidden_dim x28x28 --> nb_pixel_classes x28x28
        
    def forward(self, x): 
        # downsampling convnet
        x = self.conv1(x) # [batch_size, hidden_dim, im_size/2, im_size/2] 
        x = torch.relu(x)
        x = self.conv2(x) # [batch_size, hidden_dim, im_size/4, im_size/4] 
        x = torch.relu(x) 
        
        # upsampling convnet
        x = self.trans_conv1(x) # [batch_size, hidden_dim, im_size/2, im_size/2] 
        x = torch.relu(x)
        x = self.trans_conv2(x) # [batch_size, hidden_dim, im_size, im_size] 
        x = torch.relu(x) 

        # classification layer
        scores_pixel_class = self.classifier_head(x) # [batch_size, nb_pixel_classes, im_size, im_size] 
        
        return scores_pixel_class
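
A smoke test with assumed sizes (hidden dim 32 and 11 pixel classes are placeholders):

net = SemanticSegmentCNN(dim=32, nb_pixel_classes=11)
x = torch.rand(4, 1, 28, 28) # dummy image batch
scores = net(x)
print(scores.shape) # torch.Size([4, 11, 28, 28])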

rish-16 commented Mar 29, 2022

Bilinear Interpolation

import torch
# these dtypes assume a CUDA device; on CPU, use torch.FloatTensor / torch.LongTensor instead
dtype = torch.cuda.FloatTensor
dtype_long = torch.cuda.LongTensor

def bilinear_interpolate_torch(im, x, y):
    x0 = torch.floor(x).type(dtype_long)
    x1 = x0 + 1
    
    y0 = torch.floor(y).type(dtype_long)
    y1 = y0 + 1

    x0 = torch.clamp(x0, 0, im.shape[1]-1)
    x1 = torch.clamp(x1, 0, im.shape[1]-1)
    y0 = torch.clamp(y0, 0, im.shape[0]-1)
    y1 = torch.clamp(y1, 0, im.shape[0]-1)
    
    Ia = im[ y0, x0 ][0]
    Ib = im[ y1, x0 ][0]
    Ic = im[ y0, x1 ][0]
    Id = im[ y1, x1 ][0]
    
    wa = (x1.type(dtype)-x) * (y1.type(dtype)-y)
    wb = (x1.type(dtype)-x) * (y-y0.type(dtype))
    wc = (x-x0.type(dtype)) * (y1.type(dtype)-y)
    wd = (x-x0.type(dtype)) * (y-y0.type(dtype))

    return torch.t((torch.t(Ia)*wa)) + torch.t(torch.t(Ib)*wb) + torch.t(torch.t(Ic)*wc) + torch.t(torch.t(Id)*wd)
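
A quick CPU check (overriding the module-level dtypes above, since they assume CUDA); sampling at the centre of four pixels should return their average:

dtype = torch.FloatTensor # CPU override for this demo
dtype_long = torch.LongTensor

im = torch.arange(16.).view(4, 4, 1) # 4x4 single-channel image
x = torch.tensor([0.5])
y = torch.tensor([0.5])
print(bilinear_interpolate_torch(im, x, y)) # tensor([2.5]) = mean of pixels 0, 1, 4, 5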

tanyjnaaman commented Mar 29, 2022

Cleaned up implementation of Fast R-CNN

class VanillaFastRCNN(nn.Module):

    def __init__(self, input_dim, hidden_dim, object_size, classes, n_objects, im_size):
        super(VanillaFastRCNN, self).__init__()

        # metadata
        self.n_objects = n_objects
        self.offset = (object_size - 1)// 2
        self.object_size = object_size
        self.hidden_dim = hidden_dim
        self.im_size = im_size

        # backbone convnet
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.conv2 = nn.Conv2d(hidden_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.conv3 = nn.Conv2d(hidden_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.activation = nn.functional.relu
        
        # per region network, predicting bbox pixel anchor scores
        self.conv_boundingbox = nn.Conv2d(hidden_dim, 1, kernel_size = object_size, stride = 1, padding = self.offset) # activation map padded to detect object size

        # per region network, predicting region class
        self.linear = nn.Linear(in_features = hidden_dim * object_size **2, out_features = classes) # take object-size feature map and classify

    def forward(self, input_tensor, bounding_box_tensor, train_flag = True):

        # apply backbone convnet for feature extraction
        x = input_tensor
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.activation(x)
        x = self.conv3(x)
        x = self.activation(x)

        # predict bounding box anchors
        scores_boundingbox = self.conv_boundingbox(x).squeeze(1) # [batches, im_size, im_size]

        # predict classes for each given bounding box
        batches, c, h, w = input_tensor.shape
        if train_flag:
            boxes = []
            for b in range(batches): # for each image in batch
                for k in range(self.n_objects): # for n objects to be predicted
                    offset = self.offset
                    object_size = self.object_size
                    horizontal_left = bounding_box_tensor[b, k, 0].long() - offset
                    vertical_down = bounding_box_tensor[b, k, 1].long() - offset
                    boxes.append(x[b,:,vertical_down:vertical_down + object_size, horizontal_left:horizontal_left + object_size]) # cut out boxes of size object_size^2
            boxes = torch.stack(boxes, dim = 0) # stack to get tensor of (batch_size * n_objects) * hidden_dim * object_size * object_size
            boxes = boxes.view(-1, self.hidden_dim * object_size * object_size) # reshape for classification with linear layer
            scores_boxes = self.linear(boxes)

        else:
            total_boxes = []
            for b in range(batches):

                # get top n_objects box centres by flattening this image's score map, then sorting
                scores_b = scores_boundingbox[b].view(-1) # to im_size * im_size
                _, idx_largest = torch.sort(scores_b, descending = True)
                idx_largest = idx_largest[:self.n_objects] # take top n_objects points as centres
                idx_y = idx_largest//self.im_size  # reshape to y, x coordinate
                idx_x = idx_largest - idx_y*self.im_size

                # cut out the region around each predicted centre and append to the list of boxes, as in training
                boxes = []
                for k in range(self.n_objects): # for n objects to be predicted
                    offset = self.offset
                    object_size = self.object_size
                    horizontal_left = idx_x[k] - offset
                    vertical_down = idx_y[k] - offset
                    boxes.append(x[b,:,vertical_down:vertical_down + object_size, horizontal_left:horizontal_left + object_size]) # cut out boxes of size object_size^2
                boxes = torch.stack(boxes, dim = 0) # stack to get tensor of n_objects * hidden_dim * object_size * object_size
                boxes = boxes.view(-1, self.hidden_dim * object_size * object_size) # reshape for classification with linear layer
                total_boxes.append(boxes)

            # classify for whole batch
            total_boxes = torch.cat(total_boxes, dim = 0) # list to tensor
            scores_boxes = self.linear(total_boxes)

        return scores_boxes, scores_boundingbox

rish-16 commented Mar 29, 2022

Count network parameters

def display_num_param(net):
    nb_param = 0
    for param in net.parameters():
        nb_param += param.numel()
    print('There are {} ({:.2f} million) parameters in this neural network'.format(
        nb_param, nb_param/1e6)
         )
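
For example, with the LeNet5 defined above:

display_num_param(LeNet5()) # There are 536710 (0.54 million) parameters in this neural network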

rish-16 commented Mar 29, 2022

Mask R-CNN architecture

class MaskRCNN(nn.Module):

    def __init__(self, input_dim, hidden_dim, object_size, classes, n_objects, im_size, nb_pixel_classes):
        super(MaskRCNN, self).__init__()

        # metadata
        self.n_objects = n_objects
        self.offset = (object_size - 1)// 2
        self.object_size = object_size
        self.hidden_dim = hidden_dim
        self.im_size = im_size

        # downsampling convnet (semantic segmentation branch)
        self.ss_conv1 = nn.Conv2d(input_dim, hidden_dim, (3,3), padding=1, stride=2) # input_dim x28x28 --> hidden_dim x14x14
        self.ss_conv2 = nn.Conv2d(hidden_dim, hidden_dim, (3,3), padding=1, stride=2) # hidden_dim x14x14 --> hidden_dim x7x7 

        # upsampling convnet
        self.ss_trans_conv1 = nn.ConvTranspose2d(hidden_dim, hidden_dim, (4,4), padding=1, stride=2) #  hidden_dim x7x7 --> hidden_dim x14x14
        self.ss_trans_conv2 = nn.ConvTranspose2d(hidden_dim, hidden_dim, (4,4), padding=1, stride=2) #  hidden_dim x14x14 --> hidden_dim x28x28

        # pixel classification layer
        self.ss_classifier_head = nn.Conv2d(hidden_dim, nb_pixel_classes, (3,3), padding=1, stride=1) #  hidden_dim x28x28 --> nb_pixel_classes x28x28

        # backbone convnet
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.conv2 = nn.Conv2d(hidden_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.conv3 = nn.Conv2d(hidden_dim, hidden_dim, kernel_size = 5, stride = 1, padding = 2) # same size
        self.activation = nn.functional.relu
        
        # per region network, predicting bbox pixel anchor scores
        self.conv_boundingbox = nn.Conv2d(hidden_dim, 1, kernel_size = object_size, stride = 1, padding = self.offset) # activation map padded to detect object size

        # per region network, predicting region class
        self.linear = nn.Linear(in_features = hidden_dim * object_size **2, out_features = classes) # take object-size feature map and classify

    def forward(self, input_tensor, bounding_box_tensor, train_flag = True):

        # apply backbone convnet for feature extraction
        x = input_tensor
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.activation(x)
        x = self.conv3(x)
        x = self.activation(x)

        # predict bounding box anchors
        scores_boundingbox = self.conv_boundingbox(x).squeeze(1) # [batches, im_size, im_size]

        # predict classes for each given bounding box
        batches, c, h, w = input_tensor.shape
        if train_flag:
            boxes = []
            for b in range(batches): # for each image in batch
                for k in range(self.n_objects): # for n objects to be predicted
                    offset = self.offset
                    object_size = self.object_size
                    horizontal_left = bounding_box_tensor[b, k, 0].long() - offset
                    vertical_down = bounding_box_tensor[b, k, 1].long() - offset
                    boxes.append(x[b,:,vertical_down:vertical_down + object_size, horizontal_left:horizontal_left + object_size]) # cut out boxes of size object_size^2
            boxes = torch.stack(boxes, dim = 0) # stack to get tensor of (batch_size * n_objects) * hidden_dim * object_size * object_size
            boxes = boxes.view(-1, self.hidden_dim * object_size * object_size) # reshape for classification with linear layer
            scores_boxes = self.linear(boxes)

        else:
            total_boxes = []
            for b in range(batches):

                # get top n_objects box centres by flattening this image's score map, then sorting
                scores_b = scores_boundingbox[b].view(-1) # to im_size * im_size
                _, idx_largest = torch.sort(scores_b, descending = True)
                idx_largest = idx_largest[:self.n_objects] # take top n_objects points as centres
                idx_y = idx_largest//self.im_size  # reshape to y, x coordinate
                idx_x = idx_largest - idx_y*self.im_size

                # cut out the region around each predicted centre and append to the list of boxes, as in training
                boxes = []
                for k in range(self.n_objects): # for n objects to be predicted
                    offset = self.offset
                    object_size = self.object_size
                    horizontal_left = idx_x[k] - offset
                    vertical_down = idx_y[k] - offset
                    boxes.append(x[b,:,vertical_down:vertical_down + object_size, horizontal_left:horizontal_left + object_size]) # cut out boxes of size object_size^2
                boxes = torch.stack(boxes, dim = 0) # stack to get tensor of n_objects * hidden_dim * object_size * object_size
                boxes = boxes.view(-1, self.hidden_dim * object_size * object_size) # reshape for classification with linear layer
                total_boxes.append(boxes)

            # classify for whole batch
            total_boxes = torch.cat(total_boxes, dim = 0) # list to tensor
            scores_boxes = self.linear(total_boxes)

        # downsampling convnet
        x = input_tensor
        x = self.ss_conv1(x) # [batch_size, hidden_dim, im_size/2, im_size/2] 
        x = torch.relu(x)
        x = self.ss_conv2(x) # [batch_size, hidden_dim, im_size/4, im_size/4] 
        x = torch.relu(x) 
        
        # upsampling convnet
        x = self.ss_trans_conv1(x) # [batch_size, hidden_dim, im_size/2, im_size/2] 
        x = torch.relu(x)
        x = self.ss_trans_conv2(x) # [batch_size, hidden_dim, im_size, im_size] 
        x = torch.relu(x) 

        # classification layer
        scores_pixel_class = self.ss_classifier_head(x) # [batch_size, nb_pixel_classes, im_size, im_size] 

        return scores_boxes, scores_boundingbox, scores_pixel_class
