from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES
from .config import *
import cv2
import numpy as np
def base_transform(image, size, mean, std):
    """Resize ``image`` to ``size`` x ``size`` and normalize it.

    The resized pixels are divided by 255, ``mean`` is subtracted and the
    result is divided by ``std``.  A float32 array is returned.
    """
    resized = cv2.resize(image, (size, size))
    normalized = resized.astype(np.float32)
    normalized /= 255.0
    normalized -= mean
    normalized /= std  # probs remove this
    return normalized.astype(np.float32)
class BaseTransform:
    """Callable that applies ``base_transform`` with fixed size/mean/std.

    ``boxes`` and ``labels`` are passed through untouched so the object can
    be used wherever an ``(image, boxes, labels)`` transform is expected.
    """

    def __init__(self, size, mean, std):
        self.size = size
        # Stored as float32 arrays so the in-place math stays in float32.
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        transformed = base_transform(image, self.size, self.mean, self.std)
        return transformed, boxes, labels
import torch
from torchvision import transforms
import cv2
import numpy as np
import types
from numpy import random
def intersect(box_a, box_b):
    """Return the intersection area of every row of ``box_a`` with ``box_b``.

    ``box_a`` is indexed as a 2-D array of ``[x1, y1, x2, y2]`` rows and
    ``box_b`` as a single ``[x1, y1, x2, y2]`` box.  The result holds one
    area per row of ``box_a`` (zero where there is no overlap).
    """
    lower_right = np.minimum(box_a[:, 2:], box_b[2:])
    upper_left = np.maximum(box_a[:, :2], box_b[:2])
    # A negative extent means the boxes are disjoint along that axis.
    extent = np.clip(lower_right - upper_left, a_min=0, a_max=np.inf)
    widths, heights = extent[:, 0], extent[:, 1]
    return widths * heights
def jaccard_numpy(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes.

    The jaccard overlap is simply the intersection over union of two boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)

    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]

    Return:
        jaccard overlap: one value per row of ``box_a``
        (Shape: [box_a.shape[0]]).
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2] - box_a[:, 0]) *
              (box_a[:, 3] - box_a[:, 1]))  # one area per box in box_a
    area_b = ((box_b[2] - box_b[0]) *
              (box_b[3] - box_b[1]))  # scalar area of box_b
    union = area_a + area_b - inter
    return inter / union
class Compose(object):
    """Composes several augmentations together.

    Args:
        transforms (List[Transform]): list of transforms to compose.

    Example:
        >>> augmentations.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, boxes=None, labels=None):
        # Every transform consumes and returns the full
        # (img, boxes, labels) triple, so they can be chained in order.
        for t in self.transforms:
            img, boxes, labels = t(img, boxes, labels)
        return img, boxes, labels
class Lambda(object):
    """Applies a lambda as a transform."""

    def __init__(self, lambd):
        # types.LambdaType is the plain function type, so any Python
        # function passes this check, not just ``lambda`` expressions.
        assert isinstance(lambd, types.LambdaType)
        self.lambd = lambd

    def __call__(self, img, boxes=None, labels=None):
        result = self.lambd(img, boxes, labels)
        return result
class ConvertFromInts(object):
    """Cast the image to float32; boxes and labels pass through."""

    def __call__(self, image, boxes=None, labels=None):
        as_float = image.astype(np.float32)
        return as_float, boxes, labels
class SubtractMeans(object):
    """Subtract a fixed per-channel mean from a float32 copy of the image."""

    def __init__(self, mean):
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        # astype() copies, so the caller's array is never modified.
        shifted = image.astype(np.float32) - self.mean
        return shifted.astype(np.float32), boxes, labels
class ToUnitNorm(object):
    """Divide a float32 copy of the image by 255."""

    def __call__(self, image, boxes=None, labels=None):
        scaled = image.astype(np.float32)
        scaled /= 255.0
        return scaled.astype(np.float32), boxes, labels
class MeanNormalize(object):
    """Standardize the image: subtract ``mean`` then divide by ``std``."""

    def __init__(self, mean, std):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        # Work on a float32 copy so the input array is left untouched.
        standardized = image.astype(np.float32)
        standardized -= self.mean
        standardized /= self.std
        return standardized.astype(np.float32), boxes, labels
class ToAbsoluteCoords(object):
    """Scale box coordinates by the image width/height, in place.

    Columns 0 and 2 of ``boxes`` are multiplied by the image width and
    columns 1 and 3 by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        return image, boxes, labels
class ToPercentCoords(object):
    """Divide box coordinates by the image width/height, in place.

    Columns 0 and 2 of ``boxes`` are divided by the image width and
    columns 1 and 3 by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        return image, boxes, labels
class Resize(object):
    """Resize the image to a ``size`` x ``size`` square with ``cv2.resize``.

    Boxes and labels are returned unchanged.
    """

    def __init__(self, size=300):
        self.size = size

    def __call__(self, image, boxes=None, labels=None):
        image = cv2.resize(image, (self.size, self.size))
        return image, boxes, labels
class RandomSaturation(object):
    """With probability 1/2, scale channel 1 by a random factor.

    The factor is drawn uniformly from ``[lower, upper]``.  The image is
    modified in place, so it must already be a float array.
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, boxes=None, labels=None):
        apply_it = random.randint(2)
        if apply_it:
            factor = random.uniform(self.lower, self.upper)
            image[:, :, 1] *= factor
        return image, boxes, labels
class RandomHue(object):
    """With probability 1/2, shift channel 0 by a random offset.

    The offset is drawn uniformly from ``[-delta, delta]`` and the channel
    is wrapped back into ``[0, 360]``.  The image is modified in place, so
    it must already be a float array.
    """

    def __init__(self, delta=18.0):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            image[:, :, 0] += random.uniform(-self.delta, self.delta)
            # Wrap values that left the [0, 360] range.
            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
        return image, boxes, labels
class RandomLightingNoise(object):
    """With probability 1/2, reorder the image channels at random.

    One of the six permutations of ``(0, 1, 2)`` is picked and applied
    through ``SwapChannels``.
    """

    def __init__(self):
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            order = self.perms[random.randint(len(self.perms))]
            image = SwapChannels(order)(image)  # shuffle channels
        return image, boxes, labels
class ConvertColor(object):
    """Convert the image between the BGR and HSV color spaces.

    Only BGR -> HSV and HSV -> BGR are supported; any other
    ``current``/``transform`` pair raises ``NotImplementedError``.
    """

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, boxes=None, labels=None):
        if self.current == 'BGR' and self.transform == 'HSV':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif self.current == 'HSV' and self.transform == 'BGR':
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            # Without this ``else`` the raise ran for every input.
            raise NotImplementedError
        return image, boxes, labels
class RandomContrast(object):
    """With probability 1/2, multiply the whole image by a random factor.

    The factor is drawn uniformly from ``[lower, upper]``.
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    # expects float image
    def __call__(self, image, boxes=None, labels=None):
        apply_it = random.randint(2)
        if apply_it:
            image *= random.uniform(self.lower, self.upper)
        return image, boxes, labels
class RandomBrightness(object):
    """With probability 1/2, add a random offset to every pixel.

    The offset is drawn uniformly from ``[-delta, delta]``.  The image is
    modified in place, so it must already be a float array.
    """

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            delta = random.uniform(-self.delta, self.delta)
            image += delta
        return image, boxes, labels
class ToCV2Image(object):
    """Convert a channels-first tensor to a channels-last float32 array."""

    def __call__(self, tensor, boxes=None, labels=None):
        array = tensor.cpu().numpy().astype(np.float32)
        # Move axis 0 to the end: (a, b, c) -> (b, c, a).
        return array.transpose((1, 2, 0)), boxes, labels
class ToTensor(object):
    """Convert a channels-last array to a channels-first float32 tensor."""

    def __call__(self, cvimage, boxes=None, labels=None):
        as_float = cvimage.astype(np.float32)
        # Move the last axis to the front: (a, b, c) -> (c, a, b).
        tensor = torch.from_numpy(as_float).permute(2, 0, 1)
        return tensor, boxes, labels
class RandomSampleCrop(object):
    """Crop a random patch of the image and keep the boxes centered in it.

    Arguments:
        img (Image): the image being input during training
        boxes (Tensor): the original bounding boxes in pt form
        labels (Tensor): the class labels for each bbox
        mode (float tuple): the min and max jaccard overlaps
    Return:
        (img, boxes, classes)
            img (Image): the cropped image
            boxes (Tensor): the adjusted bounding boxes in pt form
            labels (Tensor): the class labels for each bbox
    """

    def __init__(self):
        self.sample_options = (
            # using entire original input image
            None,
            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9
            (0.1, None),
            (0.3, None),
            (0.7, None),
            (0.9, None),
            # randomly sample a patch
            (None, None),
        )

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        while True:
            # Randomly choose a mode.  Index into the tuple instead of
            # calling random.choice on it: the options are ragged
            # (None mixed with pairs) and cannot be turned into an array.
            mode = self.sample_options[random.randint(len(self.sample_options))]
            if mode is None:
                return image, boxes, labels

            min_iou, max_iou = mode
            if min_iou is None:
                min_iou = float('-inf')
            if max_iou is None:
                max_iou = float('inf')

            # max trials (50)
            for _ in range(50):
                current_image = image

                w = random.uniform(0.3 * width, width)
                h = random.uniform(0.3 * height, height)

                # aspect ratio constraint b/t .5 & 2
                if h / w < 0.5 or h / w > 2:
                    continue

                # Pass the lower bound explicitly: uniform(x) alone samples
                # between x and the default high of 1.0.
                left = random.uniform(0, width - w)
                top = random.uniform(0, height - h)

                # convert to integer rect x1,y1,x2,y2
                rect = np.array([int(left), int(top),
                                 int(left + w), int(top + h)])

                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
                overlap = jaccard_numpy(boxes, rect)

                # Is the min/max overlap constraint satisfied?  If no gt
                # box can fall inside [min_iou, max_iou], try again.  (The
                # previous test required max_iou < overlap.max(), which is
                # never true when max_iou is +inf, so nothing was rejected.)
                if overlap.max() < min_iou or overlap.min() > max_iou:
                    continue

                # cut the crop from the image
                current_image = current_image[rect[1]:rect[3],
                                              rect[0]:rect[2], :]

                # keep overlap with gt box IF center in sampled patch
                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0

                # mask in all gt boxes that above and to the left of centers
                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])

                # mask in all gt boxes that under and to the right of centers
                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])

                # mask in that both m1 and m2 are true
                mask = m1 * m2

                # have any valid boxes? try again if not
                if not mask.any():
                    continue

                # take only matching gt boxes
                current_boxes = boxes[mask, :].copy()

                # take only matching gt labels
                current_labels = labels[mask]

                # should we use the box left and top corner or the crop's
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
                                                  rect[:2])
                # adjust to crop (by subtracting crop's left,top)
                current_boxes[:, :2] -= rect[:2]

                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
                                                  rect[2:])
                # adjust to crop (by subtracting crop's left,top)
                current_boxes[:, 2:] -= rect[:2]

                return current_image, current_boxes, current_labels
class Expand(object):
    """With probability 1/2, paste the image onto a larger mean-filled canvas.

    The canvas is ``ratio`` times larger (``ratio`` uniform in [1, 4]) and
    the image is placed at a random offset; boxes are shifted by the same
    offset on a copy, so the caller's ``boxes`` array is not modified.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, image, boxes, labels):
        if random.randint(2):
            return image, boxes, labels

        height, width, depth = image.shape
        ratio = random.uniform(1, 4)
        left = random.uniform(0, width * ratio - width)
        top = random.uniform(0, height * ratio - height)

        expand_image = np.zeros(
            (int(height * ratio), int(width * ratio), depth),
            dtype=image.dtype)
        expand_image[:, :, :] = self.mean
        expand_image[int(top):int(top + height),
                     int(left):int(left + width)] = image
        image = expand_image

        boxes = boxes.copy()
        boxes[:, :2] += (int(left), int(top))
        boxes[:, 2:] += (int(left), int(top))

        return image, boxes, labels
class RandomMirror(object):
    """With probability 1/2, flip the image left-right and mirror the boxes.

    Boxes are mirrored on a copy, so the caller's array is not modified.
    """

    def __call__(self, image, boxes, classes):
        width = image.shape[1]
        _, _, _ = image.shape  # same 3-D requirement as before
        flip = random.randint(2)
        if flip:
            image = image[:, ::-1]
            boxes = boxes.copy()
            # New x1 = width - old x2 and new x2 = width - old x1.
            boxes[:, 0::2] = width - boxes[:, 2::-2]
        return image, boxes, classes
class SwapChannels(object):
    """Transforms a tensorized image by swapping the channels in the order
     specified in the swap tuple.

    Args:
        swaps (int triple): final order of channels
            eg: (2, 1, 0)
    """

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        """
        Args:
            image (Tensor): image tensor to be transformed
        Return:
            a tensor with channels swapped according to swap
        """
        # Channels are indexed on the last of three axes.
        image = image[:, :, self.swaps]
        return image
class PhotometricDistort(object):
    """Apply random brightness, contrast, saturation, hue and channel noise.

    Works on a copy of the input image.  Contrast is applied either before
    or after the HSV distortions, chosen at random per call.
    """

    def __init__(self):
        # NOTE(review): only the HSV->BGR entry survived in the file this
        # was recovered from.  The list is rebuilt so that ``pd[:-1]`` and
        # ``pd[1:]`` (used below) each keep exactly one contrast step and
        # a full BGR->HSV->BGR round trip; confirm against the original.
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast(),
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, image, boxes, labels):
        im = image.copy()
        im, boxes, labels = self.rand_brightness(im, boxes, labels)
        if random.randint(2):
            distort = Compose(self.pd[:-1])  # contrast first
        else:
            distort = Compose(self.pd[1:])   # contrast last
        im, boxes, labels = distort(im, boxes, labels)
        return self.rand_light_noise(im, boxes, labels)
# Training-time augmentation pipeline: builds one Compose of transforms in
# __init__ and applies it to (img, boxes, labels) in __call__.
class SSDAugmentation(object):
# size/mean/std are stored as given; mean and std are handed to MeanNormalize.
def __init__(self, size=160, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)):
self.mean = mean
self.std = std
self.size = size
# NOTE(review): the transform list is incomplete here -- only its last
# visible entry (MeanNormalize) remains and the closing "])" is missing.
# The entries before it cannot be determined from this file; restore them
# from the original source before use.
self.augment = Compose([
MeanNormalize(self.mean, self.std)
# Runs the composed pipeline and returns its (img, boxes, labels) result.
def __call__(self, img, boxes, labels):
return self.augment(img, boxes, labels)
import os.path
# Resolve the current user's home directory (works across platforms).
home = os.path.expanduser("~")
# Alternative dataset location:
#ddir = os.path.join(home,"data/HollywoodHeads/")
ddir = os.path.join(home, "data/VOCdevkit/")

# note: if you used our download scripts, this should be right
VOCroot = ddir  # path to VOCdevkit root dir
# default batch size
# data reshuffled at every epoch
# number of subprocesses to use for data loading
# Prior-box configuration for the 160x160 network: five feature maps.
v3 = {
    'feature_maps': [20, 10, 5, 3, 1],

    'min_dim': 160,

    'steps': [8, 16, 32, 53, 160],

    'min_sizes': [16, 30, 60, 90, 130],

    'max_sizes': [30, 60, 90, 130, 170],

    # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
    #                    [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2]],

    'variance': [0.1, 0.2],

    'clip': True,

    # NOTE(review): the name is 'v2' although this is the v3 config; left
    # unchanged because other code may key off it -- confirm.
    'name': 'v2',
}
# Prior-box configuration for a 300x300 input: six feature maps.
v2 = {
    'feature_maps': [38, 19, 10, 5, 3, 1],

    'min_dim': 300,

    'steps': [8, 16, 32, 64, 100, 300],

    'min_sizes': [30, 60, 111, 162, 213, 264],

    'max_sizes': [60, 111, 162, 213, 264, 315],

    # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
    #                    [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],

    'variance': [0.1, 0.2],

    'clip': True,

    'name': 'v2',
}
# use average pooling layer as last layer before multibox layers
v1 = {
    'feature_maps': [38, 19, 10, 5, 3, 1],

    'min_dim': 300,

    'steps': [8, 16, 32, 64, 100, 300],

    'min_sizes': [30, 60, 114, 168, 222, 276],

    'max_sizes': [-1, 114, 168, 222, 276, 330],

    # 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
    # NOTE(review): the last three entries were cut off in the file this
    # was recovered from; they are filled in to give one entry per feature
    # map, repeating the six-ratio pattern -- confirm against the original.
    'aspect_ratios': [[1, 1, 2, 1/2], [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3],
                      [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3], [1, 1, 2, 1/2, 3, 1/3]],

    'variance': [0.1, 0.2],

    'clip': True,

    'name': 'v1',
}
"""Adapted from:
    @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch
    @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn
    Licensed under The MIT License [see LICENSE for details]
"""
from __future__ import print_function
import cv2
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from torch.autograd import Variable
from data import VOCroot
from data import VOC_CLASSES as labelmap
import as data
from data import AnnotationTransform, VOCDetection, BaseTransform
from ssd import build_mbnet
import sys
import os
import time
import argparse
import numpy as np
import pickle
import cv2
# Python 2 ships a C-accelerated ElementTree under a separate module name;
# on Python 3 the plain module already uses the accelerator when available.
if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET
def str2bool(v):
    """Return True when ``v`` (case-insensitive) is yes/true/t/1."""
    truthy = ("yes", "true", "t", "1")
    return v.lower() in truthy
# Command-line options for the evaluation script.
parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection')
parser.add_argument('--trained_model', default='weights/ssd300_mAP_77.43_v2.pth',
                    type=str, help='Trained state_dict file path to open')
parser.add_argument('--save_folder', default='eval/', type=str,
                    help='File path to save results')
parser.add_argument('--confidence_threshold', default=0.01, type=float,
                    help='Detection confidence threshold')
parser.add_argument('--top_k', default=5, type=int,
                    help='Further restrict the number of predictions to parse')
parser.add_argument('--cuda', default=True, type=str2bool,
                    help='Use cuda to train model')
parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')

args = parser.parse_args()

if not os.path.exists(args.save_folder):
    os.mkdir(args.save_folder)

# NOTE(review): the bodies of the two ``if`` statements here were missing.
# Creating the save folder follows from its condition; selecting the
# default tensor type is an assumption based on the CUDA check -- confirm.
if args.cuda and torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

# '%s' / '{:s}' placeholders are filled with the image id / image-set name.
annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml')
imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg')
imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 'Main', '{:s}.txt')
YEAR = '2007'
# Built from the default VOCroot (not --voc_root), as before.
devkit_path = VOCroot + 'VOC' + YEAR
dataset_mean = (0.406, 0.456, 0.485)
dataset_std = (0.225, 0.224, 0.229)
set_type = 'test'
class Timer(object):
    """A simple timer."""

    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        """Record the start of a timed section."""
        # using time.time instead of time.clock because time time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        """Close the timed section; return the running mean or last duration."""
        elapsed = time.time() - self.start_time
        self.diff = elapsed
        self.total_time += elapsed
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff
def parse_rec(filename):
    """Parse a PASCAL VOC xml file.

    Returns a list with one dict per ``<object>`` element, holding its
    ``name``, ``pose``, ``truncated``, ``difficult`` and ``bbox``
    (``[xmin, ymin, xmax, ymax]``, each shifted by -1 to be 0-based).
    """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        obj_struct = {}
        obj_struct['name'] = obj.find('name').text
        obj_struct['pose'] = obj.find('pose').text
        obj_struct['truncated'] = int(obj.find('truncated').text)
        obj_struct['difficult'] = int(obj.find('difficult').text)
        bbox = obj.find('bndbox')
        obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1,
                              int(bbox.find('ymin').text) - 1,
                              int(bbox.find('xmax').text) - 1,
                              int(bbox.find('ymax').text) - 1]
        # Without this append the function always returned an empty list.
        objects.append(obj_struct)

    return objects
def get_output_dir(name, phase):
    """Return the directory where experimental artifacts are placed.
    If the directory does not exist, it is created.
    A canonical path is built using the name from an imdb and a network
    (if not None).
    """
    filedir = os.path.join(name, phase)
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    return filedir
def get_voc_results_file_template(image_set, cls):
    """Return the results-file path for one class, creating its directory.

    The file lives in ``<devkit_path>/results`` and is named
    ``det_<image_set>_<cls>.txt``.
    """
    # VOCdevkit/VOC2007/results/det_test_aeroplane.txt
    filename = 'det_' + image_set + '_%s.txt' % (cls)
    filedir = os.path.join(devkit_path, 'results')
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    path = os.path.join(filedir, filename)
    return path
def write_voc_results_file(all_boxes, dataset):
    """Write one VOC-format detection file per class in ``labelmap``.

    ``all_boxes[cls][image]`` is indexed with class index + 1 (index 0 is
    skipped); each detection row is written as
    ``image_id score x1 y1 x2 y2``.
    """
    for cls_ind, cls in enumerate(labelmap):
        print('Writing {:s} VOC results file'.format(cls))
        filename = get_voc_results_file_template(set_type, cls)
        with open(filename, 'wt') as f:
            for im_ind, index in enumerate(dataset.ids):
                dets = all_boxes[cls_ind + 1][im_ind]
                # len() works for both the initial empty list and an
                # ndarray; ``dets == []`` is not reliable on arrays.
                if len(dets) == 0:
                    continue
                # the VOCdevkit expects 1-based indices
                for k in range(dets.shape[0]):
                    f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
                            format(index[1], dets[k, -1],
                                   dets[k, 0] + 1, dets[k, 1] + 1,
                                   dets[k, 2] + 1, dets[k, 3] + 1))
def do_python_eval(output_dir='output', use_07=True):
    """Evaluate every class with ``voc_eval`` and print per-class and mean AP.

    For each class the precision/recall/AP are also pickled to
    ``<output_dir>/<cls>_pr.pkl``.
    """
    cachedir = os.path.join(devkit_path, 'annotations_cache')
    aps = []
    # The PASCAL VOC metric changed in 2010
    use_07_metric = use_07
    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for i, cls in enumerate(labelmap):
        filename = get_voc_results_file_template(set_type, cls)
        rec, prec, ap = voc_eval(
            filename, annopath, imgsetpath.format(set_type), cls, cachedir,
            ovthresh=0.5, use_07_metric=use_07_metric)
        aps += [ap]
        print('AP for {} = {:.4f}'.format(cls, ap))
        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
            pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
    print('Mean AP = {:.4f}'.format(np.mean(aps)))
    # NOTE(review): the body of this loop was missing; printing each AP is
    # the minimal reconstruction -- confirm the original output format.
    for ap in aps:
        print('{:.3f}'.format(ap))
    print('Results computed with the **unofficial** Python eval code.')
    print('Results should be very close to the official MATLAB eval code.')
def voc_ap(rec, prec, use_07_metric=True):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:True).
    """
    if use_07_metric:
        # 11 point metric
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    else:
        # correct AP calculation
        # first append sentinel values at the end
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute the precision envelope
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # to calculate area under PR curve, look for points
        # where X axis (recall) changes value
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # and sum (\Delta recall) * prec
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=True):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                cachedir,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath % imagename should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default True)

    Returns (rec, prec, ap); all three are -1. when the detection file
    contains no detections.
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath % imagename
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]
    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath % (imagename))
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save
        print('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f)
    else:
        # load
        # NOTE: pickle executes arbitrary code on load; the cache file must
        # only ever come from this function's own dump above.
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # np.bool was removed from NumPy; the builtin is equivalent here.
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    if any(lines):
        splitlines = [x.strip().split(' ') for x in lines]
        image_ids = [x[0] for x in splitlines]
        confidence = np.array([float(x[1]) for x in splitlines])
        BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

        # sort by confidence (highest first)
        sorted_ind = np.argsort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

        # go down dets and mark TPs and FPs
        nd = len(image_ids)
        tp = np.zeros(nd)
        fp = np.zeros(nd)
        for d in range(nd):
            R = class_recs[image_ids[d]]
            bb = BB[d, :].astype(float)
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)
            if BBGT.size > 0:
                # compute overlaps
                # intersection
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin, 0.)
                ih = np.maximum(iymax - iymin, 0.)
                inters = iw * ih
                uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
                       (BBGT[:, 2] - BBGT[:, 0]) *
                       (BBGT[:, 3] - BBGT[:, 1]) - inters)
                overlaps = inters / uni
                ovmax = np.max(overlaps)
                jmax = np.argmax(overlaps)

            if ovmax > ovthresh:
                # Matches to "difficult" boxes count as neither TP nor FP.
                if not R['difficult'][jmax]:
                    if not R['det'][jmax]:
                        tp[d] = 1.
                        R['det'][jmax] = 1
                    else:
                        # duplicate detection of an already-matched box
                        fp[d] = 1.
            else:
                fp[d] = 1.

        # compute precision recall
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        rec = tp / float(npos)
        # avoid divide by zero in case the first detection matches a difficult
        # ground truth
        prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
        ap = voc_ap(rec, prec, use_07_metric)
    else:
        rec = -1.
        prec = -1.
        ap = -1.

    return rec, prec, ap
def test_net(save_folder, net, cuda, dataset, transform, top_k,
             im_size=300, thresh=0.05):
    """Run ``net`` over every image in ``dataset`` and evaluate the result.

    Detections are collected per class and image, pickled to
    ``detections.pkl`` in the output directory, then passed to
    ``evaluate_detections``.  ``save_folder``, ``transform``, ``top_k``,
    ``im_size`` and ``thresh`` are accepted but not used here.
    """
    num_images = len(dataset)
    # all detections are collected into:
    #    all_boxes[cls][image] = N x 5 array of detections in
    #    (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(len(labelmap) + 1)]

    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}
    output_dir = get_output_dir('ssd300_120000', set_type)
    det_file = os.path.join(output_dir, 'detections.pkl')

    for i in range(num_images):
        im, gt, h, w = dataset.pull_item(i)

        x = Variable(im.unsqueeze(0))
        # Use the ``cuda`` argument rather than the global ``args.cuda``;
        # the caller in this file passes args.cuda, so this is equivalent.
        if cuda:
            x = x.cuda()
        # Start the timer right before the forward pass so that toc()
        # below measures inference only.
        _t['im_detect'].tic()
        detections = net(x).data
        detect_time = _t['im_detect'].toc(average=False)

        # skip j = 0, because it's the background class
        for j in range(1, detections.size(1)):
            dets = detections[0, j, :]
            # keep only rows whose score (column 0) is positive
            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
            dets = torch.masked_select(dets, mask).view(-1, 5)
            # numel() covers both a 0-dim and a (0, 5) result.
            if dets.numel() == 0:
                continue
            boxes = dets[:, 1:]
            # scale the box coordinates by the w/h returned by pull_item
            boxes[:, 0] *= w
            boxes[:, 2] *= w
            boxes[:, 1] *= h
            boxes[:, 3] *= h
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.hstack((boxes.cpu().numpy(), scores[:, np.newaxis])) \
                .astype(np.float32, copy=False)
            all_boxes[j][i] = cls_dets

        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1,
                                                    num_images, detect_time))

    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    evaluate_detections(all_boxes, output_dir, dataset)
def evaluate_detections(box_list, output_dir, dataset):
    """Write the VOC result files, then run the Python evaluation on them."""
    write_voc_results_file(box_list, dataset)
    # NOTE(review): this call was missing; without it ``output_dir`` is
    # unused and nothing is evaluated -- confirm against the original.
    do_python_eval(output_dir)
if __name__ == '__main__':
    # load net
    net = build_mbnet('test', 160, 21)  # initialize SSD
    # NOTE(review): the weight-loading lines were missing; --trained_model
    # is otherwise unused, so load it here and switch to eval mode.
    net.load_state_dict(torch.load(args.trained_model))
    net.eval()
    print('Finished loading model!')
    # load data
    dataset = VOCDetection(args.voc_root, [('2007', set_type)], BaseTransform(160, dataset_mean, dataset_std), AnnotationTransform())
    if args.cuda:
        net = net.cuda()
        cudnn.benchmark = True
    # evaluation
    test_net(args.save_folder, net, args.cuda, dataset,
             BaseTransform(net.size, dataset_mean, dataset_std), args.top_k, 160,
             thresh=args.confidence_threshold)
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from layers import *
from data import v3
import os
def weight_init(m):
    """Initialize the weights of ``nn.Conv2d`` modules; ignore other modules.

    Intended for ``module.apply(weight_init)``.
    """
    if isinstance(m, nn.Conv2d):
        # NOTE(review): the body of this branch was missing.  Xavier-uniform
        # is assumed from the "#xavier init" comment in MBNet.__init__ --
        # confirm against the original.
        init.xavier_uniform(m.weight.data)
class MBNet(nn.Module):
    """Single Shot Multibox Architecture
    The network is composed of a base MBNet network followed by the
    added multibox conv layers.  Each multibox layer branches into
        1) conv2d for class conf scores
        2) conv2d for localization predictions
        3) associated priorbox layer to produce default bounding
           boxes specific to the layer's feature map size.
    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Args:
        phase: (string) Can be "test" or "train"
        num_classes: number of class scores predicted per default box

    NOTE(review): this class was recovered from a copy in which closing
    brackets, ``else:`` lines and several calls were missing.  Anything
    marked "reconstructed" below is inferred from the surrounding code and
    must be checked against the original (and against any saved weights,
    whose state-dict keys depend on the exact layer layout).
    """

    def __init__(self, phase, num_classes):
        super(MBNet, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # TODO: implement __call__ in PriorBox
        self.priorbox = PriorBox(v3)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = 160

        # SSD network
        # reconstructed: BatchNorm + ReLU after each bias-free conv
        def conv_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True)
            )

        # depthwise 3x3 conv followed by pointwise 1x1 conv
        def conv_dw(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        # 1x1 bottleneck to ``mid`` channels, then 3x3 conv to ``oup``
        def conv_mb(inp, mid, oup, stride, pad=1):
            return nn.Sequential(
                nn.Conv2d(inp, mid, 1, 1, 0, bias=False),
                nn.BatchNorm2d(mid),
                nn.ReLU(inplace=True),

                nn.Conv2d(mid, oup, 3, stride, pad, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        self.head = nn.ModuleList([
            conv_bn(3, 32, 2),
            conv_dw(32, 64, 1),
            conv_dw(64, 128, 2),
            conv_dw(128, 128, 1),
            conv_dw(128, 256, 2),
            conv_dw(256, 256, 1),
            conv_dw(256, 512, 2),
            conv_dw(512, 512, 1)
        ])

        self.ssdconv1 = conv_mb(512, 128, 256, 2)
        self.ssdconv2 = conv_mb(256, 128, 256, 1, 0)
        self.ssdconv3 = conv_mb(256, 64, 128, 1, 0)

        # xavier init
        # reconstructed: the statements under this comment were missing
        self.ssdconv1.apply(weight_init)
        self.ssdconv2.apply(weight_init)
        self.ssdconv3.apply(weight_init)

        # Add localization and confidence lists.  Input channels match the
        # five source feature maps collected in forward().
        self.loc = nn.ModuleList([
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),
            nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(128, 4 * 4, kernel_size=3, padding=1)
        ])
        self.conf = nn.ModuleList([
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(512, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(128, 4 * num_classes, kernel_size=3, padding=1)
        ])

        if phase == 'test':
            # Softmax over the class axis of the (N, num_classes) view
            # built in forward(); explicit dim avoids the implicit-dim
            # deprecation.
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: input image or batch of images.

        Return:
            Depending on phase:
            test:
                the output of the ``Detect`` layer applied to the
                localization predictions, softmaxed class scores and priors.
            train:
                tuple of:
                    1: localization predictions, Shape: [batch,num_priors,4]
                    2: confidence predictions,
                       Shape: [batch,num_priors,num_classes]
                    3: the prior boxes
        """
        sources = list()
        loc = list()
        conf = list()

        # reconstructed: a source is recorded after head[5] (256 ch),
        # head[7] (512 ch) and each extra block (256, 256, 128 ch), which
        # is the only placement that matches the loc/conf input channels.
        for i in range(0, 6):
            x = self.head[i](x)
        sources.append(x)

        for i in range(6, 8):
            x = self.head[i](x)
        sources.append(x)

        x = self.ssdconv1(x)
        sources.append(x)
        x = self.ssdconv2(x)
        sources.append(x)
        x = self.ssdconv3(x)
        sources.append(x)

        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        # flatten every source's predictions and join them per sample
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),                   # loc preds
                self.softmax(conf.view(-1, self.num_classes)),  # conf preds
                self.priors                                     # default boxes
            )
        else:
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors
            )
        return output

    def load_weights(self, base_file):
        """Load a state dict from a ``.pth`` or ``.pkl`` file onto the CPU."""
        other, ext = os.path.splitext(base_file)
        # ``ext == '.pkl' or '.pth'`` was always true; test membership.
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage))
        else:
            print('Sorry only .pth and .pkl files supported.')
def build_mbnet(phase, size=160, num_classes=2):
    """Build an ``MBNet`` for ``phase`` ("test" or "train").

    Prints an error and returns ``None`` when the phase is not recognized
    or ``size`` is not 160.
    """
    if phase != "test" and phase != "train":
        print("Error: Phase not recognized")
        return
    if size != 160:
        print("Error: Sorry only MBNet160 is supported currently!")
        return

    return MBNet(phase, num_classes)
from __future__ import print_function
import os
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import argparse
from torch.autograd import Variable
import as data
from data import v3, AnnotationTransform, VOCDetection, detection_collate, VOCroot
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
from ssd import build_mbnet
import numpy as np
import time
def str2bool(v):
    """Return True when ``v`` (case-insensitive) is yes/true/t/1."""
    accepted = {"yes", "true", "t", "1"}
    return v.lower() in accepted
# Command-line options for the training script.
parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training')
parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='Min Jaccard index for matching')
parser.add_argument('--batch_size', default=64, type=int, help='Batch size for training')
parser.add_argument('--resume', default=None, type=str, help='Resume from checkpoint')
parser.add_argument('--freeze', default=False, type=str2bool, help='Freeze pretrained subgraph')
parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading')
parser.add_argument('--iterations', default=60000, type=int, help='Number of training iterations')
parser.add_argument('--start_iter', default=0, type=int, help='Begin counting iterations starting from this value (should be used with resume)')
parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model')
parser.add_argument('--lr', '--learning-rate', default=4e-3, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD')
# type=bool would turn any non-empty string (including "False") into True;
# str2bool parses the text like the other boolean flags do.
parser.add_argument('--log_iters', default=True, type=str2bool, help='Print the loss at each iteration')
parser.add_argument('--save_folder', default='weights/', help='Location to save checkpoint models')
parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')
args = parser.parse_args()
# Module-level training setup: constants, network construction, optional
# freezing of the head, optimizer and loss.
# NOTE(review): this section is incomplete -- several statements are missing
# (each spot is flagged below) and block structure has been lost.  It must be
# restored from the original source before it can run.

# NOTE(review): the body of this `if` is missing; presumably it selects a
# default tensor type depending on CUDA availability -- confirm.
if args.cuda and torch.cuda.is_available():
cfg = v3
# NOTE(review): the body of this `if` is missing; presumably it creates
# args.save_folder -- confirm.
if not os.path.exists(args.save_folder):
#train_sets = ['filtered']
train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
ssd_dim = 160
means = (0.406, 0.456, 0.485) # only support voc now
stds = (0.225, 0.224, 0.229)
num_classes = 21
batch_size = args.batch_size
accum_batch_size = 128
# True division: iter_size is a float on Python 3.
iter_size = accum_batch_size / batch_size
# Backup default values
# NOTE(review): the parsed --iterations, --weight_decay, --gamma and
# --momentum options exist alongside these constants; train() loops to
# max_iter, not args.iterations.
max_iter = 60000
weight_decay = 0.0005
stepvalues = (45000, 50000, 55000)
gamma = 0.1
momentum = 0.9
ssd_net = build_mbnet('train', ssd_dim, num_classes)
net = ssd_net
if args.cuda:
net = torch.nn.DataParallel(ssd_net)
# NOTE(review): the statement that actually loads args.resume, and the
# `else:` separating the resume path from the pretrained-head path, are not
# present here.  The use of `mbnet_head` after torch.load is also missing.
if args.resume:
print('Resuming training, loading {}...'.format(args.resume))
if not args.freeze:
print('Ensure all parameters are learnable...')
for param in ssd_net.parameters():
param.requires_grad = True
print('Loading pretrained head...')
mbnet_head = torch.load('weights/mbnethead.pth')
# With --freeze, gradients are disabled for every parameter of ssd_net.head.
if args.freeze:
print('Freezing head subgraph...')
for param in ssd_net.head.parameters():
param.requires_grad = False
if args.cuda:
cudnn.benchmark = True
# Only parameters that still require gradients are handed to SGD.
# NOTE(review): the second argument is missing (",,"); presumably
# lr=args.lr -- confirm.
optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),,
momentum=args.momentum, weight_decay=args.weight_decay)
criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False, args.cuda)
def train():
    """Run the training loop from ``args.start_iter`` up to ``max_iter``.

    Builds the VOC dataset and loader, steps the optimizer once per batch,
    decays the learning rate at each value in ``stepvalues``, prints the
    loss every 10 iterations and saves a checkpoint every 100 iterations
    and once more at the end.

    NOTE(review): the optimizer/backward calls, the ``torch.save`` calls
    and the ``.data[0]`` loss reads were missing from the copy this was
    recovered from; they are reconstructed from the surviving fragments
    (argument tails of the save calls, the ``[0]`` indexing) -- confirm.
    """
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0
    epoch = 0
    print('Loading Dataset...')

    dataset = VOCDetection(args.voc_root, train_sets, SSDAugmentation(
        ssd_dim, means, stds), AnnotationTransform())

    # At least 1 so the modulo below is defined for tiny datasets.
    epoch_size = max(1, len(dataset) // args.batch_size)
    print('Training SSD on', dataset.name)
    step_index = 0
    batch_iterator = None
    data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate)
    for iteration in range(args.start_iter, max_iter):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(data_loader)
        if iteration in stepvalues:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        # load train data
        images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(anno.cuda()) for anno in targets]
        else:
            images = Variable(images)
            targets = [Variable(anno) for anno in targets]
        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        loc_loss += loss_l.data[0]
        conf_loss += loss_c.data[0]
        if iteration % 10 == 0:
            print('Timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')
        if iteration % 100 == 0:
            print('Saving state, iter:', iteration)
            # Checkpoints go to the literal 'weights/' directory, not
            # args.save_folder, as in the surviving path fragment.
            torch.save(ssd_net.state_dict(), 'weights/mbnet_iter_' +
                       repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(), args.save_folder + 'mbnet_final.pth')
def adjust_learning_rate(optimizer, gamma, step):
    """Sets the learning rate to the initial LR decayed by ``gamma`` at every
    specified step.

    The new rate is ``args.lr * gamma ** step`` and is written to every
    parameter group of ``optimizer``.

    # Adapted from PyTorch Imagenet example:
    # https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    # NOTE(review): the base rate was missing from this expression;
    # ``args.lr`` (the --lr option) is assumed -- confirm.
    lr = args.lr * (gamma ** (step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
if __name__ == '__main__':
    # The guard had no body; train() is the only entry point defined here.
    train()
