igrekun/init.py

## __init__.py
from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES
from .config import *
import cv2
import numpy as np


def base_transform(image, size, mean, std):
    """Resize an image to a square and standardise its pixel values.

    Args:
        image: array accepted by ``cv2.resize``.
        size: side length (pixels) of the square output.
        mean: value(s) subtracted after the pixels were scaled by 1/255.
        std: value(s) the shifted pixels are divided by.
    Return:
        float32 array holding ``(resize(image) / 255 - mean) / std``.
    """
    pixels = cv2.resize(image, (size, size)).astype(np.float32)
    pixels /= 255.0
    pixels -= mean
    # Original author's note: the std division is a candidate for removal.
    pixels /= std
    return pixels.astype(np.float32)


class BaseTransform:
    """Callable that applies ``base_transform`` with a fixed size/mean/std.

    Args:
        size: side length passed on to ``base_transform``.
        mean: per-channel mean, stored as a float32 array.
        std: per-channel std, stored as a float32 array.
    """

    def __init__(self, size, mean, std):
        self.size = size
        # Keep both statistics in float32 so the image math stays float32.
        self.mean, self.std = (np.array(stat, dtype=np.float32)
                               for stat in (mean, std))

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(transformed image, boxes, labels)``; boxes/labels pass through."""
        transformed = base_transform(image, self.size, self.mean, self.std)
        return transformed, boxes, labels

## augmentations.py
import torch
from torchvision import transforms
import cv2
import numpy as np
import types
from numpy import random


def intersect(box_a, box_b):
    """Intersection area between each box of ``box_a`` and the box ``box_b``.

    Args:
        box_a: array indexed as ``[:, :2]`` (min corner) and ``[:, 2:]``
            (max corner), one row per box.
        box_b: single box indexed as ``[:2]`` / ``[2:]``.
    Return:
        one area per row of ``box_a``; 0 where the boxes do not overlap.
    """
    lower = np.maximum(box_a[:, :2], box_b[:2])
    upper = np.minimum(box_a[:, 2:], box_b[2:])
    # Negative extents mean "no overlap" and are clamped to zero.
    extent = np.clip(upper - lower, a_min=0, a_max=np.inf)
    return extent[:, 0] * extent[:, 1]


def jaccard_numpy(box_a, box_b):
    """Compute the jaccard overlap (intersection over union) of several
    boxes with one reference box.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap: one value per row of box_a, Shape: [num_boxes]
    """
    overlap_area = intersect(box_a, box_b)
    widths_a = box_a[:, 2] - box_a[:, 0]
    heights_a = box_a[:, 3] - box_a[:, 1]
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = widths_a * heights_a + area_b - overlap_area
    return overlap_area / union


class Compose(object):
    """Chains several augmentations into one callable.
    Args:
        transforms (List[Transform]): callables taking and returning
            ``(img, boxes, labels)``, applied in list order.
    Example:
        >>> augmentations.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, boxes=None, labels=None):
        """Feed the sample through every transform and return the final triple."""
        sample = (img, boxes, labels)
        for step in self.transforms:
            img, boxes, labels = step(*sample)
            sample = (img, boxes, labels)
        return sample


class Lambda(object):
    """Wraps a plain function so it can be used as a transform.

    Args:
        lambd: function called as ``lambd(img, boxes, labels)``; must be an
            instance of ``types.LambdaType``.
    """

    def __init__(self, lambd):
        is_function = isinstance(lambd, types.LambdaType)
        assert is_function
        self.lambd = lambd

    def __call__(self, img, boxes=None, labels=None):
        """Return whatever the wrapped function returns for the sample."""
        wrapped = self.lambd
        return wrapped(img, boxes, labels)


class ConvertFromInts(object):
    """Casts the image to float32; boxes and labels pass through unchanged."""

    def __call__(self, image, boxes=None, labels=None):
        as_float = image.astype(np.float32)
        return as_float, boxes, labels


class SubtractMeans(object):
    """Subtracts a fixed mean from a float32 copy of the image.

    Args:
        mean: value(s) to subtract, stored as a float32 array.
    """

    def __init__(self, mean):
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(image - mean as float32, boxes, labels)``."""
        centered = image.astype(np.float32)
        np.subtract(centered, self.mean, out=centered)
        return centered.astype(np.float32), boxes, labels


class ToUnitNorm(object):
    """Divides a float32 copy of the image by 255."""

    def __call__(self, image, boxes=None, labels=None):
        scaled = image.astype(np.float32)
        np.divide(scaled, 255.0, out=scaled)
        return scaled.astype(np.float32), boxes, labels


class MeanNormalize(object):
    """Standardises the image: subtract ``mean``, then divide by ``std``.

    Args:
        mean: value(s) to subtract, stored as a float32 array.
        std: value(s) to divide by, stored as a float32 array.
    """

    def __init__(self, mean, std):
        self.mean, self.std = (np.array(stat, dtype=np.float32)
                               for stat in (mean, std))

    def __call__(self, image, boxes=None, labels=None):
        """Return ``((image - mean) / std as float32, boxes, labels)``."""
        pixels = image.astype(np.float32)
        np.subtract(pixels, self.mean, out=pixels)
        np.divide(pixels, self.std, out=pixels)
        return pixels.astype(np.float32), boxes, labels


class ToAbsoluteCoords(object):
    """Scales box coordinates in place by the image width/height.

    Columns 0 and 2 of ``boxes`` are multiplied by the image width and
    columns 1 and 3 by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        return image, boxes, labels


class ToPercentCoords(object):
    """Divides box coordinates in place by the image width/height.

    Columns 0 and 2 of ``boxes`` are divided by the image width and
    columns 1 and 3 by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        return image, boxes, labels


class Resize(object):
    """Resizes the image to a ``size`` x ``size`` square with ``cv2.resize``.

    Args:
        size: side length of the output image (default 300).
    """

    def __init__(self, size=300):
        self.size = size

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(resized image, boxes, labels)``; boxes are not modified."""
        target = (self.size, self.size)
        return cv2.resize(image, target), boxes, labels


class RandomSaturation(object):
    """With probability 1/2, scales channel 1 of the image in place.

    The scale factor is drawn uniformly from ``[lower, upper)``.
    # assumes the image is float HSV so channel 1 is saturation — TODO confirm
    """

    def __init__(self, lower=0.5, upper=1.5):
        assert upper >= lower, "contrast upper must be >= lower."
        assert lower >= 0, "contrast lower must be non-negative."
        self.lower = lower
        self.upper = upper

    def __call__(self, image, boxes=None, labels=None):
        apply_it = random.randint(2)
        if apply_it:
            factor = random.uniform(self.lower, self.upper)
            image[:, :, 1] *= factor
        return image, boxes, labels


class RandomHue(object):
    """With probability 1/2, shifts channel 0 of the image in place.

    The shift is drawn uniformly from ``[-delta, delta)``; afterwards values
    above 360 are reduced by 360 and values below 0 are raised by 360.

    Args:
        delta: maximum shift, must lie in [0, 360].
    """

    def __init__(self, delta=18.0):
        assert 0.0 <= delta <= 360.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            hue = image[:, :, 0]  # view: edits below write into ``image``
            hue += random.uniform(-self.delta, self.delta)
            hue[hue > 360.0] -= 360.0
            hue[hue < 0.0] += 360.0
        return image, boxes, labels


class RandomLightingNoise(object):
    """With probability 1/2, reorders the image channels at random.

    The order is picked uniformly from the six permutations of (0, 1, 2)
    and applied through ``SwapChannels``.
    """

    def __init__(self):
        self.perms = (
            (0, 1, 2), (0, 2, 1),
            (1, 0, 2), (1, 2, 0),
            (2, 0, 1), (2, 1, 0),
        )

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            order = self.perms[random.randint(len(self.perms))]
            image = SwapChannels(order)(image)
        return image, boxes, labels


class ConvertColor(object):
    """Converts the image between BGR and HSV with ``cv2.cvtColor``.

    Args:
        current: colour space of the incoming image ('BGR' or 'HSV').
        transform: colour space to convert to ('HSV' or 'BGR').
    Raises:
        NotImplementedError: on call, for any pair other than BGR->HSV or
            HSV->BGR.
    """

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, boxes=None, labels=None):
        conversion = (self.current, self.transform)
        if conversion == ('BGR', 'HSV'):
            code = cv2.COLOR_BGR2HSV
        elif conversion == ('HSV', 'BGR'):
            code = cv2.COLOR_HSV2BGR
        else:
            raise NotImplementedError
        return cv2.cvtColor(image, code), boxes, labels


class RandomContrast(object):
    """With probability 1/2, multiplies the whole image in place by a factor
    drawn uniformly from ``[lower, upper)``.

    Expects a float image (the multiplication is done in place).
    """

    def __init__(self, lower=0.5, upper=1.5):
        assert upper >= lower, "contrast upper must be >= lower."
        assert lower >= 0, "contrast lower must be non-negative."
        self.lower = lower
        self.upper = upper

    def __call__(self, image, boxes=None, labels=None):
        apply_it = random.randint(2)
        if apply_it:
            image *= random.uniform(self.lower, self.upper)
        return image, boxes, labels


class RandomBrightness(object):
    """With probability 1/2, adds a constant drawn uniformly from
    ``[-delta, delta)`` to the whole image in place.

    Args:
        delta: maximum absolute offset, must lie in [0, 255].
    """

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            offset = random.uniform(-self.delta, self.delta)
            image += offset
        return image, boxes, labels


class ToCV2Image(object):
    """Turns a tensor into a float32 numpy array with axes reordered (1, 2, 0).

    # presumably converts a channels-first tensor to channels-last — verify against caller
    """

    def __call__(self, tensor, boxes=None, labels=None):
        array = tensor.cpu().numpy().astype(np.float32)
        return array.transpose((1, 2, 0)), boxes, labels


class ToTensor(object):
    """Turns a numpy image into a float32 torch tensor with axes permuted (2, 0, 1)."""

    def __call__(self, cvimage, boxes=None, labels=None):
        as_float = cvimage.astype(np.float32)
        tensor = torch.from_numpy(as_float).permute(2, 0, 1)
        return tensor, boxes, labels


class RandomSampleCrop(object):
    """Randomly crop the image and keep the boxes centred inside the crop.

    A sampling mode is drawn from ``sample_options``; ``None`` returns the
    sample untouched, otherwise up to 50 random patches are tried per mode
    until one satisfies the IoU constraint and contains a box centre.

    Arguments:
        img (array): the image being input during training (H x W x C)
        boxes (array): the original bounding boxes in pt form, [num_boxes, 4]
        labels (array): the class labels for each bbox
        mode (float tuple): the min and max jaccard overlaps
    Return:
        (img, boxes, classes)
            img (array): the cropped image
            boxes (array): the adjusted bounding boxes in pt form
            labels (array): the class labels for each kept bbox
        Samples without any box are returned unchanged.
    """
    def __init__(self):
        self.sample_options = (
            # using entire original input image
            None,
            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.7,.9
            (0.1, None),
            (0.3, None),
            (0.7, None),
            (0.9, None),
            # randomly sample a patch
            (None, None),
        )

    def __call__(self, image, boxes=None, labels=None):
        # Nothing to constrain the crop with: the IoU/centre tests below
        # would fail (or never succeed) on an empty box set.
        if boxes is None or len(boxes) == 0:
            return image, boxes, labels

        height, width, _ = image.shape
        num_options = len(self.sample_options)
        while True:
            # Pick by index: np.random.choice would first convert the ragged
            # option tuple to an array, which recent NumPy versions reject.
            mode = self.sample_options[random.randint(num_options)]
            if mode is None:
                return image, boxes, labels

            min_iou, max_iou = mode
            if min_iou is None:
                min_iou = float('-inf')
            if max_iou is None:
                max_iou = float('inf')

            # max trials (50)
            for _ in range(50):
                w = random.uniform(0.3 * width, width)
                h = random.uniform(0.3 * height, height)

                # aspect ratio constraint b/t .5 & 2
                if h / w < 0.5 or h / w > 2:
                    continue

                # Explicit lower bound: a single positional argument would
                # be taken as ``low`` with ``high`` left at 1.0.
                left = random.uniform(0, width - w)
                top = random.uniform(0, height - h)

                # convert to integer rect x1,y1,x2,y2
                rect = np.array([int(left), int(top), int(left+w), int(top+h)])

                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
                overlap = jaccard_numpy(boxes, rect)

                # Reject the patch when no box reaches min_iou or every box
                # exceeds max_iou.
                if overlap.max() < min_iou or overlap.min() > max_iou:
                    continue

                # keep a gt box only IF its center lies in the sampled patch
                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
                inside = ((rect[0] < centers[:, 0]) & (rect[1] < centers[:, 1]) &
                          (rect[2] > centers[:, 0]) & (rect[3] > centers[:, 1]))

                # have any valid boxes? try again if not
                if not inside.any():
                    continue

                # cut the crop from the image
                current_image = image[rect[1]:rect[3], rect[0]:rect[2], :]

                # take only matching gt boxes / labels
                current_boxes = boxes[inside, :].copy()
                current_labels = labels[inside]

                # clip the boxes to the crop, then shift them into the
                # crop's coordinate frame (subtract the crop's left, top)
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
                                                  rect[:2])
                current_boxes[:, :2] -= rect[:2]
                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
                                                  rect[2:])
                current_boxes[:, 2:] -= rect[:2]

                return current_image, current_boxes, current_labels


class Expand(object):
    """With probability 1/2, pastes the image onto a larger canvas.

    The canvas is ``ratio`` times the image size (ratio drawn uniformly from
    [1, 4)), filled with ``mean``; the image is placed at a random offset and
    a copy of the boxes is shifted by the same offset.

    Args:
        mean: fill value(s) for the canvas outside the pasted image.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, image, boxes, labels):
        # Coin flip: half of the samples pass through untouched.
        if random.randint(2):
            return image, boxes, labels

        height, width, depth = image.shape
        ratio = random.uniform(1, 4)
        left = random.uniform(0, width*ratio - width)
        top = random.uniform(0, height*ratio - height)
        x0, y0 = int(left), int(top)

        canvas = np.zeros((int(height*ratio), int(width*ratio), depth),
                          dtype=image.dtype)
        canvas[:, :, :] = self.mean
        canvas[y0:int(top + height), x0:int(left + width)] = image

        # Work on a copy so the caller's boxes are not modified.
        shifted = boxes.copy()
        shifted[:, :2] += (x0, y0)
        shifted[:, 2:] += (x0, y0)

        return canvas, shifted, labels


class RandomMirror(object):
    """With probability 1/2, flips the image left-right and mirrors the
    x coordinates of a copy of the boxes accordingly."""

    def __call__(self, image, boxes, classes):
        _, width, _ = image.shape
        if random.randint(2):
            image = image[:, ::-1]
            boxes = boxes.copy()
            # Columns (2, 0) become the new (0, 2): the old right edge
            # turns into the new left edge and vice versa.
            mirrored_x = width - boxes[:, 2::-2]
            boxes[:, 0::2] = mirrored_x
        return image, boxes, classes


class SwapChannels(object):
    """Reorders the last axis of a 3-D image according to ``swaps``.
    Args:
        swaps (int triple): final order of channels
            eg: (2, 1, 0)
    """

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        """
        Args:
            image: 3-D image indexed as [row, column, channel]
        Return:
            the image with its channels picked in the order given by swaps
        """
        order = self.swaps
        return image[:, :, order]


class PhotometricDistort(object):
    """Random photometric augmentation applied to a copy of the image.

    Order of operations: random brightness, then either
    (contrast -> HSV saturation/hue -> back to BGR) or
    (HSV saturation/hue -> back to BGR -> contrast), chosen by a coin flip,
    and finally random channel reordering.
    """

    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast(),
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, image, boxes, labels):
        working = image.copy()
        working, boxes, labels = self.rand_brightness(working, boxes, labels)
        # Drop the trailing contrast step or the leading one, so contrast is
        # applied exactly once: before or after the HSV round trip.
        stages = self.pd[:-1] if random.randint(2) else self.pd[1:]
        working, boxes, labels = Compose(stages)(working, boxes, labels)
        return self.rand_light_noise(working, boxes, labels)


class SSDAugmentation(object):
    """Full training-time augmentation pipeline.

    Args:
        size: side length the image is finally resized to.
        mean: per-channel mean used by ``Expand`` and ``MeanNormalize``.
        std: per-channel std used by ``MeanNormalize``.
    """

    def __init__(self, size=160, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)):
        self.mean = mean
        self.std  = std
        self.size = size
        # NOTE(review): Expand receives the same mean as MeanNormalize although
        # it runs before ToUnitNorm — confirm the fill value is intended.
        pipeline = [
            ConvertFromInts(),
            ToAbsoluteCoords(),
            PhotometricDistort(),
            Expand(self.mean),
            RandomSampleCrop(),
            RandomMirror(),
            ToPercentCoords(),
            Resize(self.size),
            ToUnitNorm(),
            MeanNormalize(self.mean, self.std),
        ]
        self.augment = Compose(pipeline)

    def __call__(self, img, boxes, labels):
        """Run the sample through the pipeline and return ``(img, boxes, labels)``."""
        augment = self.augment
        return augment(img, boxes, labels)

## config.py
# config.py
import os.path

# Resolve the current user's home directory in a cross-platform way.
home = os.path.expanduser("~")
# Alternative dataset location, left disabled by the original author:
#ddir = os.path.join(home,"data/HollywoodHeads/")
# Dataset directory: <home>/data/VOCdevkit/ (the trailing slash is part of the string).
ddir = os.path.join(home,"data/VOCdevkit/")

# note: if you used our download scripts, this should be right
VOCroot = ddir # path to VOCdevkit root dir

# default batch size
BATCHES = 64
# whether data is reshuffled at every epoch
SHUFFLE = True
# number of subprocesses to use for data loading
WORKERS = 4


# MBNET CONFIG: prior-box settings with five feature maps and min_dim 160.
v3 = {
    'feature_maps': [20, 10, 5, 3, 1],
    'min_dim': 160,
    'steps': [8, 16, 32, 53, 160],
    'min_sizes': [16, 30, 60, 90, 130],
    'max_sizes': [30, 60, 90, 130, 170],
    # Only ratios > 1 are listed, one list per feature map.
    # presumably the reciprocals are added by the prior-box code — TODO confirm
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2]],
    'variance': [0.1, 0.2],
    'clip': True,
    # NOTE(review): the name is 'v2', identical to the v2 dict below; kept
    # as is because code outside this file may branch on it — confirm.
    'name': 'v2',
}

# Prior-box settings with six feature maps and min_dim 300.
v2 = {
    'feature_maps': [38, 19, 10, 5, 3, 1],
    'min_dim': 300,
    'steps': [8, 16, 32, 64, 100, 300],
    'min_sizes': [30, 60, 111, 162, 213, 264],
    'max_sizes': [60, 111, 162, 213, 264, 315],
    # Only ratios > 1 are listed, one list per feature map.
    # presumably the reciprocals are added by the prior-box code — TODO confirm
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    'variance': [0.1, 0.2],
    'clip': True,
    'name': 'v2',
}

# use average pooling layer as last layer before multibox layers
v1 = {
    'feature_maps': [38, 19, 10, 5, 3, 1],
    'min_dim': 300,
    'steps': [8, 16, 32, 64, 100, 300],
    'min_sizes': [30, 60, 114, 168, 222, 276],
    'max_sizes': [-1, 114, 168, 222, 276, 330],
    # Unlike v2/v3, every ratio (including 1 and the reciprocals) is spelled
    # out explicitly, one list per feature map.
    'aspect_ratios': [
        [1, 1, 2, 1/2],
        [1, 1, 2, 1/2, 3, 1/3],
        [1, 1, 2, 1/2, 3, 1/3],
        [1, 1, 2, 1/2, 3, 1/3],
        [1, 1, 2, 1/2, 3, 1/3],
        [1, 1, 2, 1/2, 3, 1/3],
    ],
    'variance': [0.1, 0.2],
    'clip': True,
    'name': 'v1',
}

## eval.py
"""Adapted from:
    @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch
    @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn
    Licensed under The MIT License [see LICENSE for details]
"""

from __future__ import print_function
import cv2
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from torch.autograd import Variable
from data import VOCroot
from data import VOC_CLASSES as labelmap
import torch.utils.data as data

from data import AnnotationTransform, VOCDetection, BaseTransform
from ssd import build_mbnet

import sys
import os
import time
import argparse
import numpy as np
import pickle
import cv2

if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET

def str2bool(v):
    """Interpret a command-line string as a boolean.

    Returns True when the lower-cased value is one of "yes", "true", "t"
    or "1"; every other string gives False.
    """
    truthy = ("yes", "true", "t", "1")
    normalized = v.lower()
    return normalized in truthy

# ---- command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection')
parser.add_argument('--trained_model', default='weights/ssd300_mAP_77.43_v2.pth',
                    type=str, help='Trained state_dict file path to open')
parser.add_argument('--save_folder', default='eval/', type=str,
                    help='File path to save results')
parser.add_argument('--confidence_threshold', default=0.01, type=float,
                    help='Detection confidence threshold')
parser.add_argument('--top_k', default=5, type=int,
                    help='Further restrict the number of predictions to parse')
parser.add_argument('--cuda', default=True, type=str2bool,
                    help='Use cuda to train model')
parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')

args = parser.parse_args()

# makedirs (not mkdir) so that a nested --save_folder can be created too.
if not os.path.exists(args.save_folder):
    os.makedirs(args.save_folder)

# New tensors default to CUDA only when requested AND actually available.
if args.cuda and torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

YEAR = '2007'
set_type = 'test'
# Every dataset path hangs off --voc_root, including the directory that
# receives result files and the annotation cache; joining with os.path.join
# also removes the dependency on a trailing slash in the root path.
devkit_path = os.path.join(args.voc_root, 'VOC' + YEAR)
annopath = os.path.join(devkit_path, 'Annotations', '%s.xml')
imgpath = os.path.join(devkit_path, 'JPEGImages', '%s.jpg')
imgsetpath = os.path.join(devkit_path, 'ImageSets', 'Main', '{:s}.txt')
# Per-channel statistics handed to BaseTransform.
dataset_mean = (0.406, 0.456, 0.485)
dataset_std  = (0.225, 0.224, 0.229)

class Timer(object):
    """A simple wall-clock stopwatch that accumulates over several calls.

    Attributes are plain numbers: ``total_time`` (sum of all measured
    intervals), ``calls`` (number of ``toc`` calls), ``start_time`` (time of
    the last ``tic``), ``diff`` (last interval) and ``average_time``.
    """
    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        """Start (or restart) the measurement."""
        # time.time is used instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop the measurement.

        Returns the running average over all calls when ``average`` is true,
        otherwise the duration of the interval just measured.
        """
        now = time.time()
        self.diff = now - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff


def parse_rec(filename):
    """Parse a PASCAL VOC xml file.

    Args:
        filename: path (or file object) handed to ``ET.parse``.
    Return:
        list with one dict per ``<object>`` element holding 'name', 'pose',
        'truncated' (int), 'difficult' (int) and 'bbox'
        ([xmin, ymin, xmax, ymax], each reduced by 1).
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    objects = []
    for node in ET.parse(filename).findall('object'):
        record = {
            'name': node.find('name').text,
            'pose': node.find('pose').text,
            'truncated': int(node.find('truncated').text),
            'difficult': int(node.find('difficult').text),
        }
        bndbox = node.find('bndbox')
        # shift the coordinates by one (1-based -> 0-based)
        record['bbox'] = [int(bndbox.find(tag).text) - 1 for tag in corner_tags]
        objects.append(record)
    return objects


def get_output_dir(name, phase):
    """Return the directory ``<name>/<phase>``, creating it when missing.

    Args:
        name: parent directory.
        phase: sub-directory appended to ``name``.
    Return:
        the joined path.
    """
    target = os.path.join(name, phase)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target


def get_voc_results_file_template(image_set, cls):
    """Return the path of the per-class detection results file.

    The file lives in ``<devkit_path>/results`` (created when missing) and is
    named ``det_<image_set>_<cls>.txt``,
    e.g. VOCdevkit/VOC2007/results/det_test_aeroplane.txt
    """
    basename = 'det_' + image_set + '_%s.txt' % (cls)
    results_dir = os.path.join(devkit_path, 'results')
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    return os.path.join(results_dir, basename)


def write_voc_results_file(all_boxes, dataset):
    """Write one VOC-format detection file per class.

    Args:
        all_boxes: ``all_boxes[cls][image]`` is either an empty list (no
            detections) or an N x 5 array of (x1, y1, x2, y2, score);
            index 0 is skipped (classes are read from ``cls_ind + 1``).
        dataset: object whose ``ids`` sequence yields entries with the image
            name at position 1.
    """
    for cls_ind, cls in enumerate(labelmap):
        print('Writing {:s} VOC results file'.format(cls))
        filename = get_voc_results_file_template(set_type, cls)
        with open(filename, 'wt') as f:
            for im_ind, index in enumerate(dataset.ids):
                dets = all_boxes[cls_ind+1][im_ind]
                # len() works for both the empty-list placeholder and an
                # ndarray; ``dets == []`` would be an elementwise array
                # comparison and is not a reliable emptiness test.
                if len(dets) == 0:
                    continue
                # the VOCdevkit expects 1-based indices
                for k in range(dets.shape[0]):
                    f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
                            format(index[1], dets[k, -1],
                                   dets[k, 0] + 1, dets[k, 1] + 1,
                                   dets[k, 2] + 1, dets[k, 3] + 1))


def do_python_eval(output_dir='output', use_07=True):
    """Evaluate every class with ``voc_eval`` and print an AP summary.

    For each class in ``labelmap`` the recall/precision/AP triple is pickled
    to ``<output_dir>/<cls>_pr.pkl``; the per-class APs and their mean are
    printed.

    Args:
        output_dir: directory for the pickled curves (created when missing).
        use_07: forwarded to ``voc_eval`` as ``use_07_metric``
            (the PASCAL VOC metric changed in 2010).
    """
    cachedir = os.path.join(devkit_path, 'annotations_cache')
    print('VOC07 metric? ' + ('Yes' if use_07 else 'No'))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    aps = []
    for cls in labelmap:
        det_file = get_voc_results_file_template(set_type, cls)
        rec, prec, ap = voc_eval(
            det_file, annopath, imgsetpath.format(set_type), cls, cachedir,
            ovthresh=0.5, use_07_metric=use_07)
        aps.append(ap)
        print('AP for {} = {:.4f}'.format(cls, ap))
        curve_path = os.path.join(output_dir, cls + '_pr.pkl')
        with open(curve_path, 'wb') as f:
            pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)

    print('Mean AP = {:.4f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('Results:')
    for value in aps:
        print('{:.3f}'.format(value))
    print('{:.3f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('')
    print('--------------------------------------------------------------')
    print('Results computed with the **unofficial** Python eval code.')
    print('Results should be very close to the official MATLAB eval code.')
    print('--------------------------------------------------------------')


def voc_ap(rec, prec, use_07_metric=True):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true (the default), uses the VOC 07 11 point
    method; otherwise the area under the precision envelope is used.
    """
    if not use_07_metric:
        # append sentinel values at both ends
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # precision envelope: make precision non-increasing from the right
        for idx in reversed(range(1, mpre.size)):
            mpre[idx - 1] = np.maximum(mpre[idx - 1], mpre[idx])

        # area under the PR curve: sum (\Delta recall) * prec at the points
        # where recall changes value
        steps = np.where(mrec[1:] != mrec[:-1])[0]
        return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])

    # 11 point metric: best precision at recall >= 0, 0.1, ..., 1.0
    ap = 0.
    for threshold in np.arange(0., 1.1, 0.1):
        reached = rec >= threshold
        best = 0 if np.sum(reached) == 0 else np.max(prec[reached])
        ap = ap + best / 11.
    return ap


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=True):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                cachedir,
                                [ovthresh],
                                [use_07_metric])
    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath % imagename should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
    cachedir: Directory for caching the annotations (created when missing);
        the parsed annotations are stored in <cachedir>/annots.pkl.
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default True)

    Returns (rec, prec, ap). When the detection file holds no detections,
    (-1., -1., -1.) is returned.
    """
    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images
    with open(imagesetfile, 'r') as f:
        imagenames = [line.strip() for line in f]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath % (imagename))
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                   i + 1, len(imagenames)))
        # save
        print('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f)
    else:
        # NOTE(security): pickle.load can execute arbitrary code; the cache
        # directory must only contain files written by this function.
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # builtin bool: the ``np.bool`` alias no longer exists in NumPy
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # difficult objects do not count as positives
        npos = npos + int(np.sum(~difficult))
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets (blank lines are skipped so they cannot break the parsing)
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        splitlines = [line.strip().split(' ') for line in f if line.strip()]
    if not splitlines:
        return -1., -1., -1.

    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by decreasing confidence
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)
        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin, 0.)
            ih = np.maximum(iymax - iymin, 0.)
            inters = iw * ih
            uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
                   (BBGT[:, 2] - BBGT[:, 0]) *
                   (BBGT[:, 3] - BBGT[:, 1]) - inters)
            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            # a match with a difficult object is neither TP nor FP
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    R['det'][jmax] = True
                else:
                    # the object was already claimed by a better detection
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap


def test_net(save_folder, net, cuda, dataset, transform, top_k,
             im_size=300, thresh=0.05):
    """Run the detector over every image of ``dataset`` and evaluate the result.

    The detections are pickled to ``ssd300_120000/<set_type>/detections.pkl``
    and then handed to ``evaluate_detections``.

    Args:
        net: detection network; ``net(x).data`` is indexed as
            ``[batch, class, detection, (score, x1, y1, x2, y2)]`` with box
            coordinates relative to the image size.
        cuda: move each input batch to the GPU when true.
        dataset: object providing ``len()``, ``pull_item(i)`` returning
            ``(image tensor, gt, height, width)`` and ``ids``.
        save_folder, transform, top_k, im_size, thresh: accepted for
            interface compatibility; not used by this function.
    """
    num_images = len(dataset)
    # all detections are collected into:
    #    all_boxes[cls][image] = N x 5 array of detections in
    #    (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(len(labelmap)+1)]

    timer = Timer()
    output_dir = get_output_dir('ssd300_120000', set_type)
    det_file = os.path.join(output_dir, 'detections.pkl')

    for i in range(num_images):
        im, gt, h, w = dataset.pull_item(i)

        x = Variable(im.unsqueeze(0))
        # Honour the ``cuda`` argument rather than the global CLI namespace.
        if cuda:
            x = x.cuda()
        timer.tic()
        detections = net(x).data
        detect_time = timer.toc(average=False)

        # skip j = 0, because it's the background class
        for j in range(1, detections.size(1)):
            dets = detections[0, j, :]
            # keep only the rows with a positive score
            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
            dets = torch.masked_select(dets, mask).view(-1, 5)
            # No detection left for this class: old PyTorch reports this as a
            # 0-dim tensor, newer versions as a (0, 5) tensor.
            if dets.dim() == 0 or dets.size(0) == 0:
                continue
            # scale the relative box coordinates to pixels
            boxes = dets[:, 1:]
            boxes[:, 0] *= w
            boxes[:, 2] *= w
            boxes[:, 1] *= h
            boxes[:, 3] *= h
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.hstack((boxes.cpu().numpy(), scores[:, np.newaxis])) \
                .astype(np.float32, copy=False)
            all_boxes[j][i] = cls_dets

        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1,
                                                    num_images, detect_time))

    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    evaluate_detections(all_boxes, output_dir, dataset)


def evaluate_detections(box_list, output_dir, dataset):
    """Write the per-class result files, then run the Python VOC evaluation.

    Args:
        box_list: detections, passed to ``write_voc_results_file``.
        output_dir: directory passed to ``do_python_eval``.
        dataset: dataset object, passed to ``write_voc_results_file``.
    """
    write_voc_results_file(all_boxes=box_list, dataset=dataset)
    do_python_eval(output_dir=output_dir)


if __name__ == '__main__':
    # load net
    net = build_mbnet('test', 160, 21)    # initialize SSD
    # map_location keeps every tensor on the CPU while unpickling, so a
    # checkpoint saved on a GPU machine can also be opened on a CPU-only
    # one; the network is moved to the GPU below when requested.
    state_dict = torch.load(args.trained_model,
                            map_location=lambda storage, loc: storage)
    net.load_state_dict(state_dict)
    net.eval()
    print('Finished loading model!')
    # load data
    dataset = VOCDetection(args.voc_root, [('2007', set_type)], BaseTransform(160, dataset_mean, dataset_std), AnnotationTransform())
    if args.cuda:
        net = net.cuda()
        cudnn.benchmark = True
    # evaluation
    test_net(args.save_folder, net, args.cuda, dataset,
             BaseTransform(net.size, dataset_mean, dataset_std), args.top_k, 160,
             thresh=args.confidence_threshold)

## ssd.py
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from layers import *
from data import v3
import os


def weight_init(m):
    """Xavier-uniform initialise the weights of ``m`` when it is a Conv2d.

    Meant to be passed to ``nn.Module.apply``; every other module type is
    left untouched.
    """
    if not isinstance(m, nn.Conv2d):
        return
    conv_weight = m.weight.data
    init.xavier_uniform(conv_weight)

class MBNet(nn.Module):
    """Single Shot Multibox Architecture
    The network is composed of a base MBNet network followed by the
    added multibox conv layers.  Each multibox layer branches into
        1) conv2d for class conf scores
        2) conv2d for localization predictions
        3) associated priorbox layer to produce default bounding
           boxes specific to the layer's feature map size.
    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Args:
        phase: (string) Can be "test" or "train"
        num_classes: (int) number of class scores predicted per prior box
    """

    def __init__(self, phase, num_classes):
        super(MBNet, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # TODO: implement __call__ in PriorBox
        self.priorbox = PriorBox(v3)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = 160

        # building blocks of the network
        def conv_bn(inp, oup, stride):
            # plain 3x3 conv -> batch norm -> ReLU
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True)
            )

        def conv_dw(inp, oup, stride):
            # 3x3 conv with groups=inp followed by a 1x1 conv, each with BN + ReLU
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        def conv_mb(inp, mid, oup, stride, pad=1):
            # 1x1 conv down to `mid` channels, then a 3x3 conv to `oup`
            return nn.Sequential(
                nn.Conv2d(inp, mid, 1, 1, 0, bias=False),
                nn.BatchNorm2d(mid),
                nn.LeakyReLU(inplace=True),

                nn.Conv2d(mid, oup, 3, stride, pad, bias=False),
                nn.BatchNorm2d(oup),
                nn.LeakyReLU(inplace=True),
            )

        # backbone; forward() taps it after layer 5 and after layer 7
        self.head = nn.ModuleList([
            conv_bn(  3,  32, 2),
            conv_dw( 32,  64, 1),
            conv_dw( 64, 128, 2),
            conv_dw(128, 128, 1),
            conv_dw(128, 256, 2),
            conv_dw(256, 256, 1),
            conv_dw(256, 512, 2),
            conv_dw(512, 512, 1)
        ])

        # extra layers appended after the backbone
        self.ssdconv1 = conv_mb(512, 128, 256, 2)
        self.ssdconv2 = conv_mb(256, 128, 256, 1, 0)
        self.ssdconv3 = conv_mb(256,  64, 128, 1, 0)

        # xavier init (only the Conv2d layers are touched, see weight_init)
        self.ssdconv1.apply(weight_init)
        self.ssdconv2.apply(weight_init)
        self.ssdconv3.apply(weight_init)

        # Localization and confidence predictors, one per source feature map;
        # the 4/6 factor is the number of boxes predicted per location
        self.loc = nn.ModuleList([
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),
            nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
            nn.Conv2d(128, 4 * 4, kernel_size=3, padding=1)
        ])

        self.conf = nn.ModuleList([
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(512, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(128, 4 * num_classes, kernel_size=3, padding=1)
        ])

        if phase == 'test':
            self.softmax = nn.Softmax()
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: batch of images; the first conv expects 3 input channels and
                self.size is 160, i.e. presumably [batch,3,160,160].

        Return:
            Depending on phase:
            test:
                whatever ``self.detect`` returns for the reshaped location
                predictions, the softmaxed class scores and the priors.

            train:
                tuple of:
                    1: localization predictions, Shape: [batch,num_priors,4]
                    2: confidence predictions, Shape: [batch,num_priors,num_classes]
                    3: self.priors, as produced by the PriorBox layer
        """
        sources = list()
        loc = list()
        conf = list()

        # first source: backbone output after layers 0-5 (256 channels)
        for i in range(0, 6):
            x = self.head[i](x)
        sources.append(x)
        # second source: backbone output after layers 6-7 (512 channels)
        for i in range(6, 8):
            x = self.head[i](x)
        sources.append(x)
        # remaining sources: one per extra layer
        x = self.ssdconv1(x)
        sources.append(x)
        x = self.ssdconv2(x)
        sources.append(x)
        x = self.ssdconv3(x)
        sources.append(x)

        # apply multibox head to source layers; move channels last so that
        # flattening keeps each location's predictions contiguous
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),                   # loc preds
                self.softmax(conf.view(-1, self.num_classes)),  # conf preds
                self.priors                                     # default boxes
            )
        else:
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors
            )
        return output

    def load_weights(self, base_file):
        """Load a state dict from ``base_file`` (mapped to CPU storage).

        Only ``.pkl`` and ``.pth`` files are accepted; for any other
        extension a message is printed and nothing is loaded.
        """
        _, ext = os.path.splitext(base_file)
        # membership test: the former `ext == '.pkl' or '.pth'` was always true
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage))
            print('Finished!')
        else:
            print('Sorry only .pth and .pkl files supported.')

def build_mbnet(phase, size=160, num_classes=2):
    """Create an MBNet for the given phase.

    Prints an error and returns None when ``phase`` is neither "test" nor
    "train", or when ``size`` is anything other than 160.
    """
    if phase not in ("test", "train"):
        print("Error: Phase not recognized")
        return
    if size != 160:
        print("Error: Sorry only MBNet160 is supported currently!")
        return

    model = MBNet(phase, num_classes)
    return model

## train.py
from __future__ import print_function
import os
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import argparse
from torch.autograd import Variable
import torch.utils.data as data
from data import v3, AnnotationTransform, VOCDetection, detection_collate, VOCroot
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
from ssd import build_mbnet
import numpy as np
import time

def str2bool(v):
    """Return True when ``v`` (case-insensitive) is "yes", "true", "t" or "1"."""
    truthy_words = ("yes", "true", "t", "1")
    lowered = v.lower()
    return lowered in truthy_words

# Command-line options for training; parsed at import time.
parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training')
parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='Min Jaccard index for matching')
parser.add_argument('--batch_size', default=64, type=int, help='Batch size for training')
parser.add_argument('--resume', default=None, type=str, help='Resume from checkpoint')
parser.add_argument('--freeze', default=False, type=str2bool, help='Freeze pretrained subgraph')
parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading')
parser.add_argument('--iterations', default=60000, type=int, help='Number of training iterations')
parser.add_argument('--start_iter', default=0, type=int, help='Begin counting iterations starting from this value (should be used with resume)')
parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model')
parser.add_argument('--lr', '--learning-rate', default=4e-3, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD')
# str2bool instead of bool: bool('False') is True, so type=bool could never
# switch the flag off from the command line
parser.add_argument('--log_iters', default=True, type=str2bool, help='Print the loss at each iteration')
parser.add_argument('--save_folder', default='weights/', help='Location to save checkpoint models')
parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')
args = parser.parse_args()

# Use CUDA float tensors by default only when CUDA was requested AND is available.
if args.cuda and torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

cfg = v3

# make sure the checkpoint directory exists before training starts
if not os.path.exists(args.save_folder):
    os.mkdir(args.save_folder)

#train_sets = ['filtered']
train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
ssd_dim = 160
# per-channel normalisation statistics handed to SSDAugmentation in train()
means = (0.406, 0.456, 0.485)  # only support voc now
stds  = (0.225, 0.224, 0.229)
num_classes = 21
batch_size = args.batch_size
accum_batch_size = 128
iter_size = accum_batch_size / batch_size
# Backup default values
# NOTE(review): train() loops up to max_iter; --iterations is parsed but not
# read anywhere in the visible code
max_iter = 60000
weight_decay = 0.0005
# iterations at which train() steps the learning rate
stepvalues = (45000, 50000, 55000)
gamma = 0.1
momentum = 0.9

# ssd_net is the bare model (used for loading/saving weights); net is the
# object actually called during training (DataParallel wrapper under CUDA)
ssd_net = build_mbnet('train', ssd_dim, num_classes)
net = ssd_net

if args.cuda:
    net = torch.nn.DataParallel(ssd_net)

if args.resume:
    # continue from a full checkpoint
    print('Resuming training, loading {}...'.format(args.resume))
    ssd_net.load_weights(args.resume)
    if not args.freeze:
        print('Ensure all parameters are learnable...')
        for param in ssd_net.parameters():
            param.requires_grad = True
else:
    # fresh run: only the backbone (ssd_net.head) gets pretrained weights
    print('Loading pretrained head...')
    mbnet_head = torch.load('weights/mbnethead.pth')
    ssd_net.head.load_state_dict(mbnet_head)
    if args.freeze:
        print('Freezing head subgraph...')
        for param in ssd_net.head.parameters():
            param.requires_grad = False

if args.cuda:
    net.cuda()
    cudnn.benchmark = True

# optimise only the parameters that still require gradients (frozen ones are skipped)
optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=args.lr,
                      momentum=args.momentum, weight_decay=args.weight_decay)
# positional arguments follow MultiBoxLoss's signature, which is not visible here
criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False, args.cuda)


def train():
    """Run the training loop from ``args.start_iter`` up to ``max_iter``.

    Builds the VOC dataset and loader, then for every iteration: fetches a
    batch, runs forward/backward through ``net`` with ``criterion``, and
    steps ``optimizer``.  The learning rate is decayed at the iterations in
    ``stepvalues``.  A checkpoint is written to ``args.save_folder`` every
    100 iterations and once more when the loop finishes.
    """
    net.train()
    # running loss totals; they are reset whenever the learning rate is stepped
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading Dataset...')

    dataset = VOCDetection(args.voc_root, train_sets, SSDAugmentation(
        ssd_dim, means, stds), AnnotationTransform())

    # number of full batches per pass over the dataset
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on', dataset.name)
    step_index = 0

    batch_iterator = None
    data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate)
    for iteration in range(args.start_iter, max_iter):
        if batch_iterator is None or (iteration % epoch_size == 0):
            # start a fresh pass over the (reshuffled) data
            batch_iterator = iter(data_loader)
        if iteration in stepvalues:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

            # reset loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        # load train data
        images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(anno.cuda()) for anno in targets]
        else:
            images = Variable(images)
            targets = [Variable(anno) for anno in targets]
        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        loc_loss += loss_l.data[0]
        conf_loss += loss_c.data[0]
        if iteration % 10 == 0:
            print('Timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')
        if iteration % 100 == 0:
            print('Saving state, iter:', iteration)
            # honour --save_folder (previously hard-coded to 'weights/', which
            # may not exist when a different folder was requested)
            checkpoint_name = 'mbnet_iter_' + repr(iteration) + '.pth'
            torch.save(ssd_net.state_dict(),
                       os.path.join(args.save_folder, checkpoint_name))
    # os.path.join also copes with a --save_folder given without a trailing slash
    torch.save(ssd_net.state_dict(), os.path.join(args.save_folder, 'mbnet_final.pth'))


def adjust_learning_rate(optimizer, gamma, step):
    """Set every param group's learning rate to ``args.lr * gamma ** step``.

    # Adapted from PyTorch Imagenet example:
    # https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    decayed_lr = args.lr * (gamma ** (step))
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr


# start training only when this file is executed as a script
if __name__ == '__main__':
    train()
	from .voc0712 import VOCDetection, AnnotationTransform, detection_collate, VOC_CLASSES
	from .config import *
	import cv2
	import numpy as np


	def base_transform(image, size, mean, std):
	    """Resize ``image`` to ``size`` x ``size`` and normalise it.

	    The resized image is scaled to [0, 1] by dividing by 255, then ``mean``
	    is subtracted and the result divided by ``std``.  Returns a float32 array.
	    """
	    x = cv2.resize(image, (size, size)).astype(np.float32)
	    x /= 255.0
	    x -= mean
	    x /= std  # probs remove this
	    return x.astype(np.float32)


	class BaseTransform:
	    """Callable wrapper around ``base_transform`` with fixed size/mean/std."""

	    def __init__(self, size, mean, std):
	        self.size = size
	        self.mean = np.array(mean, dtype=np.float32)
	        self.std = np.array(std, dtype=np.float32)

	    def __call__(self, image, boxes=None, labels=None):
	        # boxes and labels are passed through untouched
	        return base_transform(image, self.size, self.mean, self.std), boxes, labels
	import torch
	from torchvision import transforms
	import cv2
	import numpy as np
	import types
	from numpy import random


	def intersect(box_a, box_b):
	    """Return the intersection area of each row of ``box_a`` with ``box_b``.

	    Args:
	        box_a: array of boxes as (x1, y1, x2, y2) rows, Shape: [num_boxes,4]
	        box_b: single box (x1, y1, x2, y2), Shape: [4]
	    Return:
	        array of intersection areas, Shape: [num_boxes]
	    """
	    max_xy = np.minimum(box_a[:, 2:], box_b[2:])
	    min_xy = np.maximum(box_a[:, :2], box_b[:2])
	    # negative extents mean no overlap; clamp them to zero
	    inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
	    return inter[:, 0] * inter[:, 1]


	def jaccard_numpy(box_a, box_b):
	    """Compute the jaccard overlap of a set of boxes with one box.  The
	    jaccard overlap is simply the intersection over union of two boxes.
	    E.g.:
	        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
	    Args:
	        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
	        box_b: Single bounding box, Shape: [4]
	    Return:
	        jaccard overlap of every box in box_a with box_b, Shape: [num_boxes]
	    """
	    inter = intersect(box_a, box_b)
	    area_a = ((box_a[:, 2] - box_a[:, 0]) *
	              (box_a[:, 3] - box_a[:, 1]))  # [num_boxes]
	    area_b = ((box_b[2] - box_b[0]) *
	              (box_b[3] - box_b[1]))  # scalar
	    union = area_a + area_b - inter
	    return inter / union  # [num_boxes]


	class Compose(object):
	    """Composes several augmentations together.
	    Args:
	        transforms (List[Transform]): list of transforms to compose.
	    Example:
	        >>> augmentations.Compose([
	        >>>     transforms.CenterCrop(10),
	        >>>     transforms.ToTensor(),
	        >>> ])
	    """

	    def __init__(self, transforms):
	        self.transforms = transforms

	    def __call__(self, img, boxes=None, labels=None):
	        # each transform receives and returns the (img, boxes, labels) triple
	        for t in self.transforms:
	            img, boxes, labels = t(img, boxes, labels)
	        return img, boxes, labels


	class Lambda(object):
	    """Applies a lambda as a transform."""

	    def __init__(self, lambd):
	        assert isinstance(lambd, types.LambdaType)
	        self.lambd = lambd

	    def __call__(self, img, boxes=None, labels=None):
	        return self.lambd(img, boxes, labels)


	class ConvertFromInts(object):
	    """Cast the image to float32; boxes and labels pass through."""

	    def __call__(self, image, boxes=None, labels=None):
	        return image.astype(np.float32), boxes, labels


	class SubtractMeans(object):
	    """Subtract a fixed mean from the image and return it as float32."""

	    def __init__(self, mean):
	        self.mean = np.array(mean, dtype=np.float32)

	    def __call__(self, image, boxes=None, labels=None):
	        image = image.astype(np.float32)
	        image -= self.mean
	        return image.astype(np.float32), boxes, labels


	class ToUnitNorm(object):
	    """Divide the image by 255 and return it as float32."""

	    def __call__(self, image, boxes=None, labels=None):
	        image = image.astype(np.float32)
	        image /= 255.0
	        return image.astype(np.float32), boxes, labels


	class MeanNormalize(object):
	    """Subtract ``mean`` from the image, then divide by ``std``."""

	    def __init__(self, mean, std):
	        self.mean = np.array(mean, dtype=np.float32)
	        self.std = np.array(std, dtype=np.float32)

	    def __call__(self, image, boxes=None, labels=None):
	        image = image.astype(np.float32)
	        image -= self.mean
	        image /= self.std
	        return image.astype(np.float32), boxes, labels


	class ToAbsoluteCoords(object):
	    """Scale box coordinates by the image width/height (modifies ``boxes`` in place)."""

	    def __call__(self, image, boxes=None, labels=None):
	        height, width, channels = image.shape
	        boxes[:, 0] *= width
	        boxes[:, 2] *= width
	        boxes[:, 1] *= height
	        boxes[:, 3] *= height

	        return image, boxes, labels


	class ToPercentCoords(object):
	    """Divide box coordinates by the image width/height (modifies ``boxes`` in place)."""

	    def __call__(self, image, boxes=None, labels=None):
	        height, width, channels = image.shape
	        boxes[:, 0] /= width
	        boxes[:, 2] /= width
	        boxes[:, 1] /= height
	        boxes[:, 3] /= height

	        return image, boxes, labels


	class Resize(object):
	    """Resize the image to ``size`` x ``size``; boxes and labels pass through."""

	    def __init__(self, size=300):
	        self.size = size

	    def __call__(self, image, boxes=None, labels=None):
	        image = cv2.resize(image, (self.size,
	                                   self.size))
	        return image, boxes, labels


	class RandomSaturation(object):
	    """With probability 1/2, scale channel 1 by a factor drawn from [lower, upper).

	    Used between the BGR->HSV and HSV->BGR conversions in
	    PhotometricDistort, so channel 1 is presumably saturation.
	    """

	    def __init__(self, lower=0.5, upper=1.5):
	        self.lower = lower
	        self.upper = upper
	        assert self.upper >= self.lower, "saturation upper must be >= lower."
	        assert self.lower >= 0, "saturation lower must be non-negative."

	    def __call__(self, image, boxes=None, labels=None):
	        if random.randint(2):
	            image[:, :, 1] *= random.uniform(self.lower, self.upper)

	        return image, boxes, labels


	class RandomHue(object):
	    """With probability 1/2, shift channel 0 by a value in [-delta, delta).

	    Values that leave the [0, 360] range are wrapped back by 360.
	    """

	    def __init__(self, delta=18.0):
	        assert delta >= 0.0 and delta <= 360.0
	        self.delta = delta

	    def __call__(self, image, boxes=None, labels=None):
	        if random.randint(2):
	            image[:, :, 0] += random.uniform(-self.delta, self.delta)
	            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
	            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
	        return image, boxes, labels


	class RandomLightingNoise(object):
	    """With probability 1/2, reorder the channels by a random permutation."""

	    def __init__(self):
	        # all six orderings of three channels
	        self.perms = ((0, 1, 2), (0, 2, 1),
	                      (1, 0, 2), (1, 2, 0),
	                      (2, 0, 1), (2, 1, 0))

	    def __call__(self, image, boxes=None, labels=None):
	        if random.randint(2):
	            swap = self.perms[random.randint(len(self.perms))]
	            shuffle = SwapChannels(swap)  # shuffle channels
	            image = shuffle(image)
	        return image, boxes, labels


	class ConvertColor(object):
	    """Convert the image between BGR and HSV with OpenCV.

	    Only BGR->HSV and HSV->BGR are supported; any other combination raises
	    NotImplementedError when the transform is called.
	    """

	    def __init__(self, current='BGR', transform='HSV'):
	        self.transform = transform
	        self.current = current

	    def __call__(self, image, boxes=None, labels=None):
	        if self.current == 'BGR' and self.transform == 'HSV':
	            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
	        elif self.current == 'HSV' and self.transform == 'BGR':
	            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
	        else:
	            raise NotImplementedError
	        return image, boxes, labels


	class RandomContrast(object):
	    """With probability 1/2, multiply the image by a factor from [lower, upper)."""

	    def __init__(self, lower=0.5, upper=1.5):
	        self.lower = lower
	        self.upper = upper
	        assert self.upper >= self.lower, "contrast upper must be >= lower."
	        assert self.lower >= 0, "contrast lower must be non-negative."

	    # expects float image
	    def __call__(self, image, boxes=None, labels=None):
	        if random.randint(2):
	            alpha = random.uniform(self.lower, self.upper)
	            image *= alpha
	        return image, boxes, labels


	class RandomBrightness(object):
	    """With probability 1/2, add a constant from [-delta, delta) to the image."""

	    def __init__(self, delta=32):
	        assert delta >= 0.0
	        assert delta <= 255.0
	        self.delta = delta

	    def __call__(self, image, boxes=None, labels=None):
	        if random.randint(2):
	            delta = random.uniform(-self.delta, self.delta)
	            image += delta
	        return image, boxes, labels


	class ToCV2Image(object):
	    """Turn a channels-first tensor into a channels-last float32 numpy array."""

	    def __call__(self, tensor, boxes=None, labels=None):
	        return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels


	class ToTensor(object):
	    """Turn a channels-last numpy image into a channels-first float32 tensor."""

	    def __call__(self, cvimage, boxes=None, labels=None):
	        return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels


	class RandomSampleCrop(object):
	    """Crop
	    Arguments:
	        img (Image): the image being input during training
	        boxes (Tensor): the original bounding boxes in pt form
	        labels (Tensor): the class labels for each bbox
	        mode (float tuple): the min and max jaccard overlaps
	    Return:
	        (img, boxes, classes)
	            img (Image): the cropped image
	            boxes (Tensor): the adjusted bounding boxes in pt form
	            labels (Tensor): the class labels for each bbox
	    """
	    def __init__(self):
	        self.sample_options = (
	            # using entire original input image
	            None,
	            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.7,.9
	            (0.1, None),
	            (0.3, None),
	            (0.7, None),
	            (0.9, None),
	            # randomly sample a patch
	            (None, None),
	        )

	    def __call__(self, image, boxes=None, labels=None):
	        height, width, _ = image.shape
	        while True:
	            # randomly choose a mode; pick by index because the options
	            # tuple is ragged (None mixed with pairs) and cannot be turned
	            # into an array by random.choice on recent NumPy
	            mode = self.sample_options[random.randint(len(self.sample_options))]
	            if mode is None:
	                return image, boxes, labels

	            min_iou, max_iou = mode
	            if min_iou is None:
	                min_iou = float('-inf')
	            if max_iou is None:
	                max_iou = float('inf')

	            # max trails (50)
	            for _ in range(50):
	                current_image = image

	                w = random.uniform(0.3 * width, width)
	                h = random.uniform(0.3 * height, height)

	                # aspect ratio constraint b/t .5 & 2
	                if h / w < 0.5 or h / w > 2:
	                    continue

	                # explicit lower bound 0: a single argument would be taken
	                # as `low` with `high` defaulting to 1.0
	                left = random.uniform(0, width - w)
	                top = random.uniform(0, height - h)

	                # convert to integer rect x1,y1,x2,y2
	                rect = np.array([int(left), int(top), int(left+w), int(top+h)])

	                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
	                overlap = jaccard_numpy(boxes, rect)

	                # is min and max overlap constraint satisfied? if not try again
	                # NOTE(review): max_iou is always inf with the options above,
	                # so this condition can never be true and the overlap
	                # constraint is never enforced - confirm the intended check
	                if overlap.min() < min_iou and max_iou < overlap.max():
	                    continue

	                # cut the crop from the image
	                current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
	                                              :]

	                # keep overlap with gt box IF center in sampled patch
	                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0

	                # mask in all gt boxes that above and to the left of centers
	                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])

	                # mask in all gt boxes that under and to the right of centers
	                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])

	                # mask in that both m1 and m2 are true
	                mask = m1 * m2

	                # have any valid boxes? try again if not
	                if not mask.any():
	                    continue

	                # take only matching gt boxes
	                current_boxes = boxes[mask, :].copy()

	                # take only matching gt labels
	                current_labels = labels[mask]

	                # should we use the box left and top corner or the crop's
	                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
	                                                  rect[:2])
	                # adjust to crop (by substracting crop's left,top)
	                current_boxes[:, :2] -= rect[:2]

	                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
	                                                  rect[2:])
	                # adjust to crop (by substracting crop's left,top)
	                current_boxes[:, 2:] -= rect[:2]

	                return current_image, current_boxes, current_labels


	class Expand(object):
	    """With probability 1/2, place the image on a larger mean-filled canvas.

	    The canvas is ``ratio`` times the image size (ratio drawn from [1, 4)),
	    the image is pasted at a random offset and the boxes are shifted by the
	    same offset.
	    """

	    def __init__(self, mean):
	        self.mean = mean

	    def __call__(self, image, boxes, labels):
	        if random.randint(2):
	            return image, boxes, labels

	        height, width, depth = image.shape
	        ratio = random.uniform(1, 4)
	        left = random.uniform(0, width*ratio - width)
	        top = random.uniform(0, height*ratio - height)

	        # canvas of ratio-scaled size (the `*` operators had been lost,
	        # leaving the undefined names `heightratio` / `widthratio`)
	        expand_image = np.zeros(
	            (int(height*ratio), int(width*ratio), depth),
	            dtype=image.dtype)
	        expand_image[:, :, :] = self.mean
	        expand_image[int(top):int(top + height),
	                     int(left):int(left + width)] = image
	        image = expand_image

	        # shift a copy so the caller's boxes are left untouched
	        boxes = boxes.copy()
	        boxes[:, :2] += (int(left), int(top))
	        boxes[:, 2:] += (int(left), int(top))

	        return image, boxes, labels


	class RandomMirror(object):
	    """With probability 1/2, flip the image horizontally and mirror the boxes."""

	    def __call__(self, image, boxes, classes):
	        _, width, _ = image.shape
	        if random.randint(2):
	            image = image[:, ::-1]
	            boxes = boxes.copy()
	            # new (x1, x2) = width - old (x2, x1)
	            boxes[:, 0::2] = width - boxes[:, 2::-2]
	        return image, boxes, classes


	class SwapChannels(object):
	    """Transforms a tensorized image by swapping the channels in the order
	     specified in the swap tuple.
	    Args:
	        swaps (int triple): final order of channels
	            eg: (2, 1, 0)
	    """

	    def __init__(self, swaps):
	        self.swaps = swaps

	    def __call__(self, image):
	        """
	        Args:
	            image (Tensor): image tensor to be transformed
	        Return:
	            a tensor with channels swapped according to swap
	        """
	        # if torch.is_tensor(image):
	        #     image = image.data.cpu().numpy()
	        # else:
	        #     image = np.array(image)
	        image = image[:, :, self.swaps]
	        return image


	class PhotometricDistort(object):
	    """Random brightness, contrast/saturation/hue and channel-order distortion.

	    Contrast is applied either before or after the HSV-space distortions,
	    chosen at random on every call.
	    """

	    def __init__(self):
	        self.pd = [
	            RandomContrast(),
	            ConvertColor(transform='HSV'),
	            RandomSaturation(),
	            RandomHue(),
	            ConvertColor(current='HSV', transform='BGR'),
	            RandomContrast()
	        ]
	        self.rand_brightness = RandomBrightness()
	        self.rand_light_noise = RandomLightingNoise()

	    def __call__(self, image, boxes, labels):
	        # work on a copy: the distortions modify the array in place
	        im = image.copy()
	        im, boxes, labels = self.rand_brightness(im, boxes, labels)
	        if random.randint(2):
	            # contrast first: drop the trailing RandomContrast
	            distort = Compose(self.pd[:-1])
	        else:
	            # contrast last: drop the leading RandomContrast
	            distort = Compose(self.pd[1:])
	        im, boxes, labels = distort(im, boxes, labels)
	        return self.rand_light_noise(im, boxes, labels)


	class SSDAugmentation(object):
	    """Training-time augmentation pipeline.

	    Applies, in order: float conversion, absolute box coordinates,
	    photometric distortion, expansion, random crop, random mirror, relative
	    box coordinates, resize to ``size``, scaling to [0, 1] and mean/std
	    normalisation.
	    """

	    def __init__(self, size=160, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)):
	        self.mean = mean
	        self.std = std
	        self.size = size
	        # NOTE(review): Expand fills its canvas with self.mean before
	        # ToUnitNorm divides by 255, so with the default mean the fill is
	        # close to zero on the 0-255 scale - confirm this is intended
	        self.augment = Compose([
	            ConvertFromInts(),
	            ToAbsoluteCoords(),
	            PhotometricDistort(),
	            Expand(self.mean),
	            RandomSampleCrop(),
	            RandomMirror(),
	            ToPercentCoords(),
	            Resize(self.size),
	            ToUnitNorm(),
	            MeanNormalize(self.mean, self.std)
	        ])

	    def __call__(self, img, boxes, labels):
	        return self.augment(img, boxes, labels)
	# config.py
	import os.path

	# gets home dir cross platform
	home = os.path.expanduser("~")
	#ddir = os.path.join(home,"data/HollywoodHeads/")
	ddir = os.path.join(home,"data/VOCdevkit/")

	# note: if you used our download scripts, this should be right
	VOCroot = ddir # path to VOCdevkit root dir

	# default batch size
	BATCHES = 64
	# data reshuffled at every epoch
	SHUFFLE = True
	# number of subprocesses to use for data loading
	WORKERS = 4


	#MBNET CONFIG
	# prior-box settings for the 160x160 network (five feature maps)
	v3 = {
	    'feature_maps' : [20, 10, 5, 3, 1],

	    'min_dim' : 160,

	    'steps' : [8, 16, 32, 53, 160],

	    'min_sizes' : [16, 30, 60, 90, 130],

	    'max_sizes' : [30, 60, 90, 130, 170],

	    # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
	    #                    [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
	    'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2]],

	    'variance' : [0.1, 0.2],

	    'clip' : True,

	    # NOTE(review): still reads 'v2' although this dict is v3 - confirm
	    # whether any consumer relies on the name before changing it
	    'name' : 'v2',
	}

	# prior-box settings for a 300x300 input (six feature maps)
	v2 = {
	    'feature_maps' : [38, 19, 10, 5, 3, 1],

	    'min_dim' : 300,

	    'steps' : [8, 16, 32, 64, 100, 300],

	    'min_sizes' : [30, 60, 111, 162, 213, 264],

	    'max_sizes' : [60, 111, 162, 213, 264, 315],

	    # 'aspect_ratios' : [[2, 1/2], [2, 1/2, 3, 1/3], [2, 1/2, 3, 1/3],
	    #                    [2, 1/2, 3, 1/3], [2, 1/2], [2, 1/2]],
	    'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2], [2]],

	    'variance' : [0.1, 0.2],

	    'clip' : True,

	    'name' : 'v2',
	}

	# use average pooling layer as last layer before multibox layers
	v1 = {
	    'feature_maps' : [38, 19, 10, 5, 3, 1],

	    'min_dim' : 300,

	    'steps' : [8, 16, 32, 64, 100, 300],

	    'min_sizes' : [30, 60, 114, 168, 222, 276],

	    'max_sizes' : [-1, 114, 168, 222, 276, 330],

	    # 'aspect_ratios' : [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
	    'aspect_ratios' : [[1,1,2,1/2],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3],
	                       [1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3],[1,1,2,1/2,3,1/3]],

	    'variance' : [0.1, 0.2],

	    'clip' : True,

	    'name' : 'v1',
	}
	"""Adapted from:
	@longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch
	@rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn
	Licensed under The MIT License [see LICENSE for details]
	"""

	from __future__ import print_function
	import cv2
	import torch
	import torch.nn as nn
	import torch.backends.cudnn as cudnn
	import torchvision.transforms as transforms
	from torch.autograd import Variable
	from data import VOCroot
	from data import VOC_CLASSES as labelmap
	import torch.utils.data as data

	from data import AnnotationTransform, VOCDetection, BaseTransform
	from ssd import build_mbnet

	import sys
	import os
	import time
	import argparse
	import numpy as np
	import pickle
	import cv2

	if sys.version_info[0] == 2:
	import xml.etree.cElementTree as ET
	else:
	import xml.etree.ElementTree as ET

	def str2bool(v):
	    """Return True when ``v`` (case-insensitive) is "yes", "true", "t" or "1"."""
	    return v.lower() in ("yes", "true", "t", "1")

	# Command-line options for evaluation; parsed at import time.
	parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection')
	parser.add_argument('--trained_model', default='weights/ssd300_mAP_77.43_v2.pth',
	                    type=str, help='Trained state_dict file path to open')
	parser.add_argument('--save_folder', default='eval/', type=str,
	                    help='File path to save results')
	parser.add_argument('--confidence_threshold', default=0.01, type=float,
	                    help='Detection confidence threshold')
	parser.add_argument('--top_k', default=5, type=int,
	                    help='Further restrict the number of predictions to parse')
	parser.add_argument('--cuda', default=True, type=str2bool,
	                    help='Use cuda to train model')
	parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')

	args = parser.parse_args()

	if not os.path.exists(args.save_folder):
	    os.mkdir(args.save_folder)

	# Use CUDA float tensors by default only when CUDA was requested AND is available.
	if args.cuda and torch.cuda.is_available():
	    torch.set_default_tensor_type('torch.cuda.FloatTensor')
	else:
	    torch.set_default_tensor_type('torch.FloatTensor')

	# path templates into the VOC2007 tree under --voc_root
	annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml')
	imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg')
	imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 'Main', '{:s}.txt')
	YEAR = '2007'
	# NOTE(review): built from VOCroot rather than args.voc_root, so result and
	# cache files ignore a --voc_root override - confirm whether that is intended
	devkit_path = VOCroot + 'VOC' + YEAR
	dataset_mean = (0.406, 0.456, 0.485)
	dataset_std = (0.225, 0.224, 0.229)
	set_type = 'test'

	class Timer(object):
	    """A simple timer."""
	    def __init__(self):
	        self.total_time = 0.
	        self.calls = 0
	        self.start_time = 0.
	        self.diff = 0.
	        self.average_time = 0.

	    def tic(self):
	        """Start (or restart) the timer."""
	        # using time.time instead of time.clock because time time.clock
	        # does not normalize for multithreading
	        self.start_time = time.time()

	    def toc(self, average=True):
	        """Stop the timer and return the average (default) or last duration."""
	        self.diff = time.time() - self.start_time
	        self.total_time += self.diff
	        self.calls += 1
	        self.average_time = self.total_time / self.calls
	        if average:
	            return self.average_time
	        else:
	            return self.diff


	def parse_rec(filename):
	    """ Parse a PASCAL VOC xml file

	    Returns a list with one dict per <object> element, holding its name,
	    pose, truncated and difficult flags (as ints) and its bounding box as
	    [xmin, ymin, xmax, ymax], each shifted by -1.
	    """
	    tree = ET.parse(filename)
	    objects = []
	    for obj in tree.findall('object'):
	        obj_struct = {}
	        obj_struct['name'] = obj.find('name').text
	        obj_struct['pose'] = obj.find('pose').text
	        obj_struct['truncated'] = int(obj.find('truncated').text)
	        obj_struct['difficult'] = int(obj.find('difficult').text)
	        bbox = obj.find('bndbox')
	        obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1,
	                              int(bbox.find('ymin').text) - 1,
	                              int(bbox.find('xmax').text) - 1,
	                              int(bbox.find('ymax').text) - 1]
	        objects.append(obj_struct)

	    return objects


	def get_output_dir(name, phase):
	    """Return the directory where experimental artifacts are placed.
	    If the directory does not exist, it is created.
	    The path is ``name`` joined with ``phase``.
	    """
	    filedir = os.path.join(name, phase)
	    if not os.path.exists(filedir):
	        os.makedirs(filedir)
	    return filedir


	def get_voc_results_file_template(image_set, cls):
	    """Return the results-file path for one class, creating the directory.

	    The file is ``det_<image_set>_<cls>.txt`` inside ``<devkit_path>/results``.
	    """
	    # VOCdevkit/VOC2007/results/det_test_aeroplane.txt
	    filename = 'det_' + image_set + '_%s.txt' % (cls)
	    filedir = os.path.join(devkit_path, 'results')
	    if not os.path.exists(filedir):
	        os.makedirs(filedir)
	    path = os.path.join(filedir, filename)
	    return path


	def write_voc_results_file(all_boxes, dataset):
	    """Write one VOC-format detection results file per class in ``labelmap``.

	    Args:
	        all_boxes: indexed as ``all_boxes[class_index][image_index]``, where
	            class index 0 is skipped; each entry is either empty or an array
	            whose rows are (x1, y1, x2, y2, score).
	        dataset: object whose ``ids`` lists the images; element 1 of each id
	            is written as the image identifier.
	    """
	    for cls_ind, cls in enumerate(labelmap):
	        print('Writing {:s} VOC results file'.format(cls))
	        filename = get_voc_results_file_template(set_type, cls)
	        with open(filename, 'wt') as f:
	            for im_ind, index in enumerate(dataset.ids):
	                dets = all_boxes[cls_ind+1][im_ind]
	                # length check works for both an empty list and an empty
	                # array; `dets == []` is unreliable on ndarrays
	                if len(dets) == 0:
	                    continue
	                # the VOCdevkit expects 1-based indices
	                for k in range(dets.shape[0]):
	                    f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
	                            format(index[1], dets[k, -1],
	                                   dets[k, 0] + 1, dets[k, 1] + 1,
	                                   dets[k, 2] + 1, dets[k, 3] + 1))


	def do_python_eval(output_dir='output', use_07=True):
	    """Evaluate every class with ``voc_eval`` and print per-class and mean AP.

	    For each class the recall/precision/AP triple is also pickled to
	    ``<output_dir>/<class>_pr.pkl``.
	    """
	    cachedir = os.path.join(devkit_path, 'annotations_cache')
	    aps = []
	    # The PASCAL VOC metric changed in 2010
	    use_07_metric = use_07
	    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
	    if not os.path.isdir(output_dir):
	        os.mkdir(output_dir)
	    for i, cls in enumerate(labelmap):
	        filename = get_voc_results_file_template(set_type, cls)
	        rec, prec, ap = voc_eval(
	            filename, annopath, imgsetpath.format(set_type), cls, cachedir,
	            ovthresh=0.5, use_07_metric=use_07_metric)
	        aps.append(ap)
	        print('AP for {} = {:.4f}'.format(cls, ap))
	        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
	            pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
	    print('Mean AP = {:.4f}'.format(np.mean(aps)))
	    print('~~~~~~~~')
	    print('Results:')
	    for ap in aps:
	        print('{:.3f}'.format(ap))
	    print('{:.3f}'.format(np.mean(aps)))
	    print('~~~~~~~~')
	    print('')
	    print('--------------------------------------------------------------')
	    print('Results computed with the unofficial Python eval code.')
	    print('Results should be very close to the official MATLAB eval code.')
	    print('--------------------------------------------------------------')


	def voc_ap(rec, prec, use_07_metric=True):
	    """Compute VOC average precision from recall and precision arrays.

	    ap = voc_ap(rec, prec, [use_07_metric])

	    Args:
	        rec: Array of recall values.
	        prec: Array of precision values, aligned with ``rec``.
	        use_07_metric: When true (the default), use the VOC07 11-point
	            method; otherwise integrate the precision envelope over recall.

	    Returns:
	        The average precision.
	    """
	    if not use_07_metric:
	        # pad both curves with sentinel values at the ends
	        recall = np.concatenate(([0.], rec, [1.]))
	        envelope = np.concatenate(([0.], prec, [0.]))

	        # precision envelope: running maximum taken from the right
	        envelope = np.maximum.accumulate(envelope[::-1])[::-1]

	        # positions where the recall value changes
	        steps = np.where(recall[1:] != recall[:-1])[0]

	        # sum (\Delta recall) * prec
	        return np.sum((recall[steps + 1] - recall[steps]) * envelope[steps + 1])

	    # 11 point metric: average the best precision reached at recall >= t
	    ap = 0.
	    for threshold in np.arange(0., 1.1, 0.1):
	        reached = rec >= threshold
	        best = np.max(prec[reached]) if np.sum(reached) != 0 else 0
	        ap = ap + best / 11.
	    return ap


	def voc_eval(detpath,
	             annopath,
	             imagesetfile,
	             classname,
	             cachedir,
	             ovthresh=0.5,
	             use_07_metric=True):
	    """rec, prec, ap = voc_eval(detpath, annopath, imagesetfile, classname,
	                                cachedir, [ovthresh], [use_07_metric])

	    Top level function that does the PASCAL VOC evaluation for one class.

	    Args:
	        detpath: Path to detections; ``detpath.format(classname)`` should
	            produce the detection results file. Each non-blank line is
	            ``image_id score x1 y1 x2 y2`` separated by single spaces.
	        annopath: %-style template; ``annopath % imagename`` should be the
	            xml annotations file.
	        imagesetfile: Text file containing the list of images, one image
	            per line.
	        classname: Category name.
	        cachedir: Directory for caching the parsed annotations
	            (``annots.pkl``); created when missing.
	        ovthresh: Overlap threshold (default 0.5).
	        use_07_metric: Whether to use VOC07's 11 point AP computation
	            (default True).

	    Returns:
	        ``(rec, prec, ap)``: cumulative recall and precision over the
	        detections sorted by descending confidence, and the resulting AP.
	        All three are ``-1.`` when the detection file has no entries.
	    """
	    # first load gt
	    os.makedirs(cachedir, exist_ok=True)
	    cachefile = os.path.join(cachedir, 'annots.pkl')
	    # read list of images
	    with open(imagesetfile, 'r') as f:
	        imagenames = [x.strip() for x in f]
	    if not os.path.isfile(cachefile):
	        # load annots
	        recs = {}
	        for i, imagename in enumerate(imagenames):
	            recs[imagename] = parse_rec(annopath % (imagename))
	            if i % 100 == 0:
	                print('Reading annotation for {:d}/{:d}'.format(
	                    i + 1, len(imagenames)))
	        # save
	        print('Saving cached annotations to {:s}'.format(cachefile))
	        with open(cachefile, 'wb') as f:
	            pickle.dump(recs, f)
	    else:
	        # load
	        # NOTE: the cache is unpickled as-is; cachedir must only point at
	        # files this tool wrote itself.
	        with open(cachefile, 'rb') as f:
	            recs = pickle.load(f)

	    # extract gt objects for this class
	    class_recs = {}
	    npos = 0
	    for imagename in imagenames:
	        R = [obj for obj in recs[imagename] if obj['name'] == classname]
	        bbox = np.array([x['bbox'] for x in R])
	        # builtin bool: the np.bool alias no longer exists in current NumPy
	        difficult = np.array([x['difficult'] for x in R]).astype(bool)
	        det = [False] * len(R)
	        # only non-difficult objects count as positives
	        npos = npos + int((~difficult).sum())
	        class_recs[imagename] = {'bbox': bbox,
	                                 'difficult': difficult,
	                                 'det': det}

	    # read dets (blank lines are ignored)
	    detfile = detpath.format(classname)
	    with open(detfile, 'r') as f:
	        splitlines = [x.strip().split(' ') for x in f if x.strip()]
	    if not splitlines:
	        # no detections for this class
	        return -1., -1., -1.

	    image_ids = [x[0] for x in splitlines]
	    confidence = np.array([float(x[1]) for x in splitlines])
	    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

	    # sort by confidence
	    sorted_ind = np.argsort(-confidence)
	    BB = BB[sorted_ind, :]
	    image_ids = [image_ids[x] for x in sorted_ind]

	    # go down dets and mark TPs and FPs
	    nd = len(image_ids)
	    tp = np.zeros(nd)
	    fp = np.zeros(nd)
	    for d in range(nd):
	        R = class_recs[image_ids[d]]
	        bb = BB[d, :].astype(float)
	        ovmax = -np.inf
	        BBGT = R['bbox'].astype(float)
	        if BBGT.size > 0:
	            # compute overlaps
	            # intersection
	            ixmin = np.maximum(BBGT[:, 0], bb[0])
	            iymin = np.maximum(BBGT[:, 1], bb[1])
	            ixmax = np.minimum(BBGT[:, 2], bb[2])
	            iymax = np.minimum(BBGT[:, 3], bb[3])
	            iw = np.maximum(ixmax - ixmin, 0.)
	            ih = np.maximum(iymax - iymin, 0.)
	            inters = iw * ih
	            # union = area(det) + area(gt) - intersection
	            uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
	                   (BBGT[:, 2] - BBGT[:, 0]) *
	                   (BBGT[:, 3] - BBGT[:, 1]) - inters)
	            overlaps = inters / uni
	            ovmax = np.max(overlaps)
	            jmax = np.argmax(overlaps)

	        if ovmax > ovthresh:
	            # a match with a difficult object is neither a TP nor a FP
	            if not R['difficult'][jmax]:
	                if not R['det'][jmax]:
	                    tp[d] = 1.
	                    R['det'][jmax] = True
	                else:
	                    # this ground truth was already claimed by a
	                    # higher-scoring detection
	                    fp[d] = 1.
	        else:
	            fp[d] = 1.

	    # compute precision recall
	    fp = np.cumsum(fp)
	    tp = np.cumsum(tp)
	    rec = tp / float(npos)
	    # avoid divide by zero in case the first detection matches a difficult
	    # ground truth
	    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
	    ap = voc_ap(rec, prec, use_07_metric)

	    return rec, prec, ap


	def test_net(save_folder, net, cuda, dataset, transform, top_k,
	             im_size=300, thresh=0.05):
	    """Run ``net`` over every image of ``dataset``, save and evaluate results.

	    The raw per-class detections are pickled to ``detections.pkl`` inside
	    ``get_output_dir('ssd300_120000', set_type)`` and then passed to
	    ``evaluate_detections``.

	    Args:
	        save_folder: Unused; kept for interface compatibility.
	        net: Detection network; ``net(x).data`` is indexed as
	            ``[0, class, :]`` with each row holding a score followed by
	            four box coordinates.
	        cuda: Move each input to the GPU before the forward pass when true.
	        dataset: Provides ``len()``, ``pull_item(i)`` returning
	            ``(image, gt, height, width)`` and the ``ids`` used when writing
	            results.
	        transform: Unused; kept for interface compatibility.
	        top_k: Unused; kept for interface compatibility.
	        im_size: Unused; kept for interface compatibility.
	        thresh: Unused; kept for interface compatibility.
	    """
	    num_images = len(dataset)
	    # all detections are collected into:
	    #    all_boxes[cls][image] = N x 5 array of detections in
	    #    (x1, y1, x2, y2, score)
	    all_boxes = [[[] for _ in range(num_images)]
	                 for _ in range(len(labelmap) + 1)]

	    detect_timer = Timer()
	    output_dir = get_output_dir('ssd300_120000', set_type)
	    det_file = os.path.join(output_dir, 'detections.pkl')

	    for i in range(num_images):
	        im, _, h, w = dataset.pull_item(i)

	        x = Variable(im.unsqueeze(0))
	        # honour the caller's ``cuda`` argument rather than the global args
	        if cuda:
	            x = x.cuda()
	        detect_timer.tic()
	        detections = net(x).data
	        detect_time = detect_timer.toc(average=False)

	        # skip j = 0, because it's the background class
	        for j in range(1, detections.size(1)):
	            dets = detections[0, j, :]
	            # keep only the rows whose score (column 0) is positive
	            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
	            dets = torch.masked_select(dets, mask).view(-1, 5)
	            # numel() is 0 for any empty result, whatever its rank; the
	            # [] placeholder then stays in all_boxes[j][i]
	            if dets.numel() == 0:
	                continue
	            # scale the box coordinates by the image width / height
	            boxes = dets[:, 1:]
	            boxes[:, 0] *= w
	            boxes[:, 2] *= w
	            boxes[:, 1] *= h
	            boxes[:, 3] *= h
	            scores = dets[:, 0].cpu().numpy()
	            cls_dets = np.hstack((boxes.cpu().numpy(), scores[:, np.newaxis])) \
	                .astype(np.float32, copy=False)
	            all_boxes[j][i] = cls_dets

	        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1,
	                                                    num_images, detect_time))

	    with open(det_file, 'wb') as f:
	        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

	    print('Evaluating detections')
	    evaluate_detections(all_boxes, output_dir, dataset)


	def evaluate_detections(box_list, output_dir, dataset):
	    """Write the per-class VOC result files, then run the Python evaluation.

	    Args:
	        box_list: Detections, forwarded to ``write_voc_results_file``.
	        output_dir: Directory forwarded to ``do_python_eval``.
	        dataset: Dataset forwarded to ``write_voc_results_file``.
	    """
	    write_voc_results_file(all_boxes=box_list, dataset=dataset)
	    do_python_eval(output_dir=output_dir)


	if __name__ == '__main__':
	    # load net
	    net = build_mbnet('test', 160, 21)  # initialize SSD
	    # Map every storage to the CPU while loading so a checkpoint saved on a
	    # GPU also loads on a CPU-only host (same approach as
	    # MBNet.load_weights); the model is moved to the GPU below on request.
	    net.load_state_dict(torch.load(
	        args.trained_model, map_location=lambda storage, loc: storage))
	    net.eval()
	    print('Finished loading model!')
	    # load data
	    dataset = VOCDetection(args.voc_root, [('2007', set_type)],
	                           BaseTransform(160, dataset_mean, dataset_std),
	                           AnnotationTransform())
	    if args.cuda:
	        net = net.cuda()
	        cudnn.benchmark = True
	    # evaluation
	    test_net(args.save_folder, net, args.cuda, dataset,
	             BaseTransform(net.size, dataset_mean, dataset_std), args.top_k, 160,
	             thresh=args.confidence_threshold)
	import torch
	import torch.nn as nn
	import torch.nn.init as init
	import torch.nn.functional as F
	from torch.autograd import Variable
	from layers import *
	from data import v3
	import os


	def weight_init(m):
	    """Apply Xavier-uniform initialisation to the weight of a ``nn.Conv2d``.

	    Suitable for ``Module.apply``: modules of any other type are left
	    untouched.

	    Args:
	        m: The module to (possibly) initialise in place.
	    """
	    if not isinstance(m, nn.Conv2d):
	        return
	    init.xavier_uniform(m.weight.data)

	class MBNet(nn.Module):
	    """Single Shot Multibox Architecture

	    The network is composed of a base MBNet network (a stack of plain and
	    depthwise-separable conv blocks) followed by the added multibox conv
	    layers. Each multibox layer branches into
	        1) conv2d for class conf scores
	        2) conv2d for localization predictions
	        3) associated priorbox layer to produce default bounding
	           boxes specific to the layer's feature map size.
	    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

	    Args:
	        phase: (string) Can be "test" or "train"
	        num_classes: (int) number of classes scored per default box
	    """

	    def __init__(self, phase, num_classes):
	        super(MBNet, self).__init__()
	        self.phase = phase
	        self.num_classes = num_classes
	        # TODO: implement __call__ in PriorBox
	        self.priorbox = PriorBox(v3)
	        self.priors = Variable(self.priorbox.forward(), volatile=True)
	        self.size = 160

	        # SSD network
	        def conv_bn(inp, oup, stride):
	            # 3x3 conv -> batch norm -> ReLU
	            return nn.Sequential(
	                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
	                nn.BatchNorm2d(oup),
	                nn.ReLU(inplace=True)
	            )

	        def conv_dw(inp, oup, stride):
	            # depthwise 3x3 conv (groups=inp) followed by a pointwise 1x1
	            # conv, each with batch norm and ReLU
	            return nn.Sequential(
	                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
	                nn.BatchNorm2d(inp),
	                nn.ReLU(inplace=True),

	                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
	                nn.BatchNorm2d(oup),
	                nn.ReLU(inplace=True),
	            )

	        def conv_mb(inp, mid, oup, stride, pad=1):
	            # 1x1 bottleneck conv to ``mid`` channels, then a 3x3 conv to
	            # ``oup`` channels, each with batch norm and LeakyReLU
	            return nn.Sequential(
	                nn.Conv2d(inp, mid, 1, 1, 0, bias=False),
	                nn.BatchNorm2d(mid),
	                nn.LeakyReLU(inplace=True),

	                nn.Conv2d(mid, oup, 3, stride, pad, bias=False),
	                nn.BatchNorm2d(oup),
	                nn.LeakyReLU(inplace=True),
	            )

	        self.head = nn.ModuleList([
	            conv_bn(3, 32, 2),
	            conv_dw(32, 64, 1),
	            conv_dw(64, 128, 2),
	            conv_dw(128, 128, 1),
	            conv_dw(128, 256, 2),
	            conv_dw(256, 256, 1),
	            conv_dw(256, 512, 2),
	            conv_dw(512, 512, 1)
	        ])

	        self.ssdconv1 = conv_mb(512, 128, 256, 2)
	        self.ssdconv2 = conv_mb(256, 128, 256, 1, 0)
	        self.ssdconv3 = conv_mb(256, 64, 128, 1, 0)

	        # xavier init
	        self.ssdconv1.apply(weight_init)
	        self.ssdconv2.apply(weight_init)
	        self.ssdconv3.apply(weight_init)

	        # Add localization and confidence lists. Entry k consumes the k-th
	        # source feature map collected in forward(), so the input channel
	        # counts (256, 512, 256, 256, 128) must match those maps.
	        self.loc = nn.ModuleList([
	            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),
	            nn.Conv2d(512, 6 * 4, kernel_size=3, padding=1),
	            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
	            nn.Conv2d(256, 6 * 4, kernel_size=3, padding=1),
	            nn.Conv2d(128, 4 * 4, kernel_size=3, padding=1)
	        ])

	        self.conf = nn.ModuleList([
	            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),
	            nn.Conv2d(512, 6 * num_classes, kernel_size=3, padding=1),
	            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
	            nn.Conv2d(256, 6 * num_classes, kernel_size=3, padding=1),
	            nn.Conv2d(128, 4 * num_classes, kernel_size=3, padding=1)
	        ])

	        if phase == 'test':
	            # NOTE(review): Softmax is built without an explicit dim; it is
	            # only applied to a 2-D [N, num_classes] view in forward().
	            self.softmax = nn.Softmax()
	            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

	    def forward(self, x):
	        """Applies network layers and ops on input image(s) x.

	        Args:
	            x: input batch of images with 3 channels; presumably
	                Shape: [batch,3,160,160] given ``self.size`` - TODO confirm.

	        Return:
	            Depending on phase:
	            test:
	                the result of ``self.detect`` applied to the localization
	                predictions (Shape: [batch,num_priors,4]), the softmaxed
	                confidences (Shape: [batch*num_priors,num_classes]) and
	                the default boxes.

	            train:
	                tuple of:
	                    1: localization layers, Shape: [batch,num_priors,4]
	                    2: confidence layers, Shape: [batch,num_priors,num_classes]
	                    3: the default boxes ``self.priors``
	        """
	        sources = list()
	        loc = list()
	        conf = list()

	        # source 1: output of head block 5 (256 channels)
	        for i in range(0, 6):
	            x = self.head[i](x)
	        sources.append(x)
	        # source 2: output of head block 7 (512 channels)
	        for i in range(6, 8):
	            x = self.head[i](x)
	        sources.append(x)
	        # sources 3-5: outputs of the extra multibox conv layers
	        x = self.ssdconv1(x)
	        sources.append(x)
	        x = self.ssdconv2(x)
	        sources.append(x)
	        x = self.ssdconv3(x)
	        sources.append(x)

	        # apply multibox head to source layers
	        for (x, l, c) in zip(sources, self.loc, self.conf):
	            # move channels last so each location's predictions are adjacent
	            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
	            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

	        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
	        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

	        if self.phase == "test":
	            output = self.detect(
	                loc.view(loc.size(0), -1, 4),                   # loc preds
	                self.softmax(conf.view(-1, self.num_classes)),  # conf preds
	                self.priors                                     # default boxes
	            )
	        else:
	            output = (
	                loc.view(loc.size(0), -1, 4),
	                conf.view(conf.size(0), -1, self.num_classes),
	                self.priors
	            )
	        return output

	    def load_weights(self, base_file):
	        """Load a state dict from ``base_file`` if it is a .pth or .pkl file.

	        Tensors are mapped to CPU storage while loading. For any other
	        extension a message is printed and nothing is loaded.

	        Args:
	            base_file: Path of the checkpoint file.
	        """
	        _, ext = os.path.splitext(base_file)
	        # membership test: ``ext == '.pkl' or '.pth'`` would always be true
	        if ext in ('.pkl', '.pth'):
	            print('Loading weights into state dict...')
	            self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage))
	            print('Finished!')
	        else:
	            print('Sorry only .pth and .pkl files supported.')


	def build_mbnet(phase, size=160, num_classes=2):
	    """Build an ``MBNet``, or print an error and return None.

	    Args:
	        phase: Must be "test" or "train".
	        size: Input size; only 160 is accepted.
	        num_classes: Number of classes passed to ``MBNet``.

	    Returns:
	        ``MBNet(phase, num_classes)``, or None when ``phase`` or ``size``
	        is rejected (an error message is printed in that case).
	    """
	    if phase not in ("test", "train"):
	        print("Error: Phase not recognized")
	        return None
	    if size == 160:
	        return MBNet(phase, num_classes)
	    print("Error: Sorry only MBNet160 is supported currently!")
	    return None