import math
import torch
import torch.nn as nn
import numpy as np
from torch.utils.checkpoint import checkpoint
def convert_x1y1x2y2_to_xywh(box):
    """Convert boxes from corner format [x1, y1, x2, y2] to [x, y, w, h].

    ``x``/``y`` are the box centres. Width and height follow the inclusive
    pixel convention used throughout this file, i.e. ``w = x2 - x1 + 1``.

    :param box: (N, 4) boxes; either a torch tensor or a numpy array.
    :return: (N, 4) boxes of the same container type as ``box``.
    """
    cx = (box[:, 0] + box[:, 2]) / 2.
    cy = (box[:, 1] + box[:, 3]) / 2.
    cw = box[:, 2] - box[:, 0] + 1
    ch = box[:, 3] - box[:, 1] + 1
    if torch.is_tensor(box):
        columns = (cx.view(-1, 1), cy.view(-1, 1), cw.view(-1, 1), ch.view(-1, 1))
        return torch.cat(columns, 1)
    columns = (cx.reshape(-1, 1), cy.reshape(-1, 1), cw.reshape(-1, 1), ch.reshape(-1, 1))
    return np.concatenate(columns, 1)
class RegionLoss(nn.Module):
    """Region loss (coordinate + confidence + class terms) for a dense anchor grid.

    The network output is interpreted as ``nA`` anchors per grid cell, each
    carrying ``[tx, ty, tw, th, conf, class scores...]``.

    :param anchors: (nA, 2) numpy array of anchor ``[w, h]``. If every value
        is <= 1 the anchors are treated as normalised and scaled to the
        output grid size in ``forward``.
    :param n_classes: number of classes; the class loss is only computed when
        this is greater than 1.
    :param coord_scale: weight of the coordinate loss.
    :param reduction: factor ground-truth boxes are divided by to bring them
        into grid units.
    :param noobject_scale: confidence weight for cells without an object.
    :param object_scale: confidence weight for cells assigned an object.
    :param class_scale: weight of the class loss.
    :param thresh: predictions whose IoU with any ground truth exceeds this
        value are excluded from the no-object confidence penalty.
    :param coord_prefill: while ``seen`` is below this value the x/y targets
        are pre-filled with 0.5.

    NOTE(review): the tail of the original signature was lost; the defaults
    ``thresh=0.6`` and ``coord_prefill=12800`` are reconstructed and should be
    confirmed against the original source.
    """

    def __init__(self, anchors, n_classes, coord_scale=1, reduction=32, noobject_scale=1, object_scale=5, class_scale=1,
                 thresh=0.6, coord_prefill=12800):
        super(RegionLoss, self).__init__()
        self.anchors = torch.from_numpy(anchors).float()
        self.coord_scale = coord_scale
        self.reduction = reduction
        self.noobject_scale = noobject_scale
        self.object_scale = object_scale
        self.class_scale = class_scale
        self.thresh = thresh
        self.coord_prefill = coord_prefill
        self.n_classes = n_classes
        # Running count of samples processed; saved with the module state.
        self.register_buffer('seen', torch.tensor(0))

    def forward(self, output, target, seen=None):
        """Compute the loss for one batch.

        :param output: (nB, nA * (5 + n_classes), nH, nW) raw network output.
        :param target: sequence of length nB; each item is an (N, 5) numpy
            array of ``[x1, y1, x2, y2, cls]`` rows. Rows whose last column is
            not greater than -1 are ignored.
        :param seen: optional value to reset the ``seen`` counter to before
            the batch size is added.
        :return: ``(total_loss, loss_dict)`` where ``loss_dict`` holds the
            python floats ``'coord'``, ``'conf'`` and ``'cls'`` (``0.0`` when
            no class loss is computed).
        """
        nB = output.size(0)
        nH = output.size(2)
        nW = output.size(3)
        nC = self.n_classes
        nPixels = nH * nW
        device = output.device

        anchors = self.anchors.clone()
        if anchors.max() <= 1:
            # Column 0 is the anchor width and column 1 the height (see
            # anchor_w / anchor_h below), so scale them by nW / nH respectively.
            anchors[:, 0] *= nW
            anchors[:, 1] *= nH
        nA = len(anchors)

        # Update the counter in place so the registered buffer keeps its device.
        if seen is not None:
            self.seen.fill_(int(seen))
        self.seen += nB

        output = output.view(nB, nA, -1, nPixels)
        coord = torch.cat([output[:, :, :2].sigmoid(),  # tx, ty
                           output[:, :, 2:4]], 2)       # tw, th
        conf = output[:, :, 4].sigmoid()
        if nC > 1:
            cls = output[:, :, 5:].contiguous().view(nB * nA, nC, nPixels).transpose(1, 2).contiguous().view(-1, nC)

        # Decode prediction boxes on the CPU (no gradient) for target assignment.
        lin_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).view(nPixels)
        lin_y = torch.linspace(0, nH - 1, nH).view(nH, 1).repeat(1, nW).view(nPixels)
        anchor_w = anchors[:, 0].contiguous().view(nA, 1)
        anchor_h = anchors[:, 1].contiguous().view(nA, 1)
        coord_cpu = coord.detach().cpu().float()
        pred_boxes = torch.stack((coord_cpu[:, :, 0] + lin_x,
                                  coord_cpu[:, :, 1] + lin_y,
                                  coord_cpu[:, :, 2].exp() * anchor_w,
                                  coord_cpu[:, :, 3].exp() * anchor_h), -1).view(-1, 4)

        coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls = self.build_targets(pred_boxes, target, nH, nW, anchors)

        # Masks hold loss weights; taking the square root lets them be applied
        # to both operands of the squared-error loss.
        coord_mask = coord_mask.expand_as(tcoord).sqrt().to(device)
        conf_mask = conf_mask.sqrt().to(device)
        tcoord = tcoord.to(device)
        tconf = tconf.to(device)

        if nC > 1:
            # One flag per (batch, anchor, cell), in the same order as the rows of ``cls``.
            selected = cls_mask.view(-1) > 0
            tcls = tcls.view(-1)[selected].long().to(device)
            cls = cls[selected.to(device)]

        # Compute losses
        mse = nn.MSELoss(reduction='sum')
        self.loss_coord = self.coord_scale * mse(coord * coord_mask, tcoord * coord_mask) / nB
        self.loss_conf = mse(conf * conf_mask, tconf * conf_mask) / nB
        if nC > 1:
            self.loss_cls = self.class_scale * 2 * nn.CrossEntropyLoss(reduction='sum')(cls, tcls) / nB
            self.loss_tot = self.loss_coord + self.loss_conf + self.loss_cls
        else:
            self.loss_cls = None
            self.loss_tot = self.loss_coord + self.loss_conf

        loss_dict = {
            'coord': self.loss_coord.item(),
            'conf': self.loss_conf.item(),
            'cls': self.loss_cls.item() if self.loss_cls is not None else 0.0,
        }
        return self.loss_tot, loss_dict

    def build_targets(self, pred_boxes, target, nH, nW, anchors):
        """Build loss masks and regression/classification targets on the CPU.

        :param pred_boxes: (nB * nA * nH * nW, 4) decoded ``[xc, yc, w, h]``
            predictions in grid units.
        :param target: per-image ground truth, see ``forward``.
        :param nH: grid height.
        :param nW: grid width.
        :param anchors: (nA, 2) anchors in grid units.
        :return: ``(coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls)``.
        """
        nB = len(target)
        nA = len(anchors)
        nPixels = nH * nW
        nAnchors = nA * nPixels

        conf_mask = torch.ones(nB, nA, nPixels, requires_grad=False) * self.noobject_scale
        coord_mask = torch.zeros(nB, nA, 1, nPixels, requires_grad=False)
        cls_mask = torch.zeros(nB, nA, nPixels, requires_grad=False).byte()
        tcoord = torch.zeros(nB, nA, 4, nPixels, requires_grad=False)
        tconf = torch.zeros(nB, nA, nPixels, requires_grad=False)
        tcls = torch.zeros(nB, nA, nPixels, requires_grad=False)

        if self.seen < self.coord_prefill:
            # NOTE(review): coord_mask stays zero outside object cells and the
            # targets of object cells are overwritten below, so as written this
            # prefill does not contribute to the loss. Confirm whether
            # coord_mask was meant to be filled here as well.
            tcoord[:, :, 0].fill_(0.5)
            tcoord[:, :, 1].fill_(0.5)

        # Anchors as boxes centred at the origin, for shape-only IoU matching.
        anchors_ = torch.cat([torch.zeros_like(anchors), anchors], 1)

        for b in range(nB):
            gt = target[b][target[b][:, -1] > -1]
            if gt.shape[0] == 0:
                continue
            gt = torch.from_numpy(gt).float()
            cur_pred_boxes = pred_boxes[b * nAnchors: (b + 1) * nAnchors]

            gt_box = gt[:, :4]
            gt_cls = gt[:, -1]
            gt_wh = convert_x1y1x2y2_to_xywh(gt_box / self.reduction)

            # Do not penalise confidence of predictions that already overlap a ground truth.
            iou_gt_pred = bbox_ious(gt_wh, cur_pred_boxes)
            mask = (iou_gt_pred > self.thresh).sum(0) >= 1
            conf_mask[b][mask.view_as(conf_mask[b])] = 0

            # Pick the best anchor per ground truth by comparing shapes only.
            gt_wh_ = gt_wh.clone()
            gt_wh_[:, :2] = 0
            iou_gt_anchors = bbox_ious(gt_wh_, anchors_)
            _, best_anchors = iou_gt_anchors.max(1)

            for i in range(gt.size(0)):
                gi = min(nW - 1, max(0, int(gt_wh[i, 0])))
                gj = min(nH - 1, max(0, int(gt_wh[i, 1])))
                cell = gj * nW + gi
                best_n = int(best_anchors[i])
                iou = iou_gt_pred[i][best_n * nPixels + cell]

                coord_mask[b][best_n][0][cell] = 2 - (gt_wh[i, 2] * gt_wh[i, 3]) / nPixels
                cls_mask[b][best_n][cell] = 1
                conf_mask[b][best_n][cell] = self.object_scale
                tcoord[b][best_n][0][cell] = gt_wh[i, 0] - gi
                tcoord[b][best_n][1][cell] = gt_wh[i, 1] - gj
                tcoord[b][best_n][2][cell] = math.log(gt_wh[i, 2] / anchors[best_n, 0])
                tcoord[b][best_n][3][cell] = math.log(gt_wh[i, 3] / anchors[best_n, 1])
                tconf[b][best_n][cell] = iou
                tcls[b][best_n][cell] = gt_cls[i]

        return coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls
def bbox_ious(boxes1, boxes2):
    """ Compute IOU between all boxes from ``boxes1`` with all boxes from ``boxes2``.

    Args:
        boxes1 (torch.Tensor): List of bounding boxes
        boxes2 (torch.Tensor): List of bounding boxes

    Returns:
        torch.Tensor: ``(len(boxes1), len(boxes2))`` matrix of IoU values.

    Note:
        List format: [[xc, yc, w, h],...]
    """
    # Corner coordinates as column vectors of shape (N, 1).
    b1x1, b1y1 = (boxes1[:, :2] - (boxes1[:, 2:4] / 2)).split(1, 1)
    b1x2, b1y2 = (boxes1[:, :2] + (boxes1[:, 2:4] / 2)).split(1, 1)
    b2x1, b2y1 = (boxes2[:, :2] - (boxes2[:, 2:4] / 2)).split(1, 1)
    b2x2, b2y2 = (boxes2[:, :2] + (boxes2[:, 2:4] / 2)).split(1, 1)

    # Broadcasting (N1, 1) against (1, N2) yields every pairwise overlap.
    dx = (b1x2.min(b2x2.t()) - b1x1.max(b2x1.t())).clamp(min=0)
    dy = (b1y2.min(b2y2.t()) - b1y1.max(b2y1.t())).clamp(min=0)
    intersections = dx * dy

    areas1 = (b1x2 - b1x1) * (b1y2 - b1y1)
    areas2 = (b2x2 - b2x1) * (b2y2 - b2y1)
    unions = (areas1 + areas2.t()) - intersections

    return intersections / unions
class Conv_1x1(nn.Module):
    """1x1 convolution followed by batch normalisation and ReLU.

    :param in_planes: number of input channels.
    :param out_planes: number of output channels.

    NOTE(review): the batch-norm attribute name was lost in the original
    text; ``bn`` is a reconstruction. Confirm it if existing checkpoints
    depend on the state-dict keys.
    """

    def __init__(self, in_planes, out_planes):
        super(Conv_1x1, self).__init__()
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1)
        self.bn = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU; the spatial size is unchanged."""
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x
class MiniModel(nn.Module):
    """Fuse five feature maps into one prediction map.

    Each input is projected to ``dim`` channels with a ``Conv_1x1`` block
    (run under gradient checkpointing), inputs 2-5 are upsampled by fixed
    factors (2, 4, 8, 8), everything is concatenated along the channel axis
    and a final 1x1 convolution produces the output.

    :param dim: channel width of every projected branch (default 256).
    :param out_channels: channels of the final prediction map (default 35).
    """

    def __init__(self, dim=256, out_channels=35):
        super(MiniModel, self).__init__()
        self.up5 = nn.Upsample(scale_factor=8, mode='nearest')
        self.up4 = nn.Upsample(scale_factor=8, mode='nearest')
        self.up3 = nn.Upsample(scale_factor=4, mode='nearest')
        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.layer1_1 = Conv_1x1(64, dim)
        self.layer2_1 = Conv_1x1(256, dim)
        self.layer3_1 = Conv_1x1(512, dim)
        self.layer4_1 = Conv_1x1(1024, dim)
        self.layer5_1 = Conv_1x1(1024, dim)
        # Five branches of ``dim`` channels each are concatenated (1280 for dim=256).
        self.out1 = nn.Conv2d(5 * dim, out_channels, kernel_size=1)

    def forward(self, layer1, layer2, layer3, layer4, layer5):
        """Return the fused prediction map at the spatial size of ``layer1``.

        The upsampled branches must match ``layer1`` spatially for the
        concatenation to succeed.
        """
        n1 = checkpoint(self.layer1_1, layer1)
        n2 = checkpoint(self.layer2_1, layer2)
        n3 = checkpoint(self.layer3_1, layer3)
        n4 = checkpoint(self.layer4_1, layer4)
        n5 = checkpoint(self.layer5_1, layer5)
        layer2 = self.up2(n2)
        layer3 = self.up3(n3)
        layer4 = self.up4(n4)
        layer5 = self.up5(n5)
        x = torch.cat([n1, layer2, layer3, layer4, layer5], 1)
        out1 = self.out1(x)
        return out1
# Smoke test: push random feature maps through MiniModel and evaluate the
# region loss once. Uses CUDA when available, otherwise falls back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

layer1 = torch.randn(2, 64, 208, 208, device=device, requires_grad=True)
layer2 = torch.randn(2, 256, 104, 104, device=device, requires_grad=True)
layer3 = torch.randn(2, 512, 52, 52, device=device, requires_grad=True)
layer4 = torch.randn(2, 1024, 26, 26, device=device, requires_grad=True)
layer5 = torch.randn(2, 1024, 26, 26, device=device, requires_grad=True)

# One (N, 5) array per image; rows are [x1, y1, x2, y2, cls].
# NOTE(review): the second array was cut off in the original text; more rows
# may have followed.
labels = [np.array([[8., 159., 20., 178., 1.],
                    [354., 275., 373., 292., 1.],
                    [324., 4., 335., 20., 1.],
                    [286., 257., 307., 283., 1.],
                    [163., 11., 178., 38., 1.],
                    [221., 19., 234., 44., 1.]]),
          np.array([[355., 26., 359., 29., 1.],
                    [357., 345., 363., 353., 1.],
                    [291., 11., 295., 14., 1.],
                    [146., 261., 149., 263., 1.],
                    [356., 6., 368., 18., 1.],
                    [223., 220., 230., 224., 1.],
                    [314., 177., 317., 180., 1.],
                    [146., 263., 149., 266., 1.],
                    [14., 272., 22., 278., 1.],
                    [301., 343., 304., 347., 1.],
                    [246., 34., 249., 36., 1.],
                    [194., 55., 197., 58., 1.],
                    [22., 214., 26., 219., 1.],
                    [318., 162., 329., 170., 1.]])]

model = MiniModel()
model = torch.nn.DataParallel(model)
model = model.to(device)

anchors = np.array([[0.01710912, 0.02316356],
                    [0.04235875, 0.04513844],
                    [0.05432437, 0.07076002],
                    [0.09045923, 0.09586145],
                    [0.15661931, 0.16021108]])

# NOTE(review): the original argument list was truncated after ``anchors``.
# n_classes=2 follows from the 35 output channels (5 anchors * (5 + 2)).
# reduction=2 is an assumption so that the label coordinates (up to ~373)
# fall inside the 208x208 output grid -- confirm both against the original.
criterion1 = RegionLoss(anchors=anchors, n_classes=2, reduction=2)

out1 = model(layer1, layer2, layer3, layer4, layer5)
loss, loss_dict = criterion1(out1, labels)
