Skip to content

Instantly share code, notes, and snippets.

@yjxiong
Last active November 14, 2018 00:13
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save yjxiong/4db6fdf84d95c3e41efa to your computer and use it in GitHub Desktop.
Save yjxiong/4db6fdf84d95c3e41efa to your computer and use it in GitHub Desktop.

This gist holds the Caffe style model spec for the CVPR'15 paper

Recognize Complex Events from Static Images by Fusing Deep Channels

The model has two channels, one for appearance analysis, the other one for detection bounding box analysis.

The appearance analysis channel has a structure similar to AlexNet and is therefore initialized from a model pretrained on ImageNet.


There are some minor differences between this model and the one described in the paper

  • Only person and face detection results are used

The weights for deploy model can be downloaded at

cuhk_wider_deep_channel_fusing.caffemodel


We also provide a simple "Python Layer" to generate the multi-scale spatial map on the fly. It can be used with Caffe in the following fashion

layer {
  type: "Python"
  name: "map_data"
  top: "map_data"
  python_param {
    module: "mss_map_layer"
    layer: "MultiScaleSpatialMapLayer"
    param_str: "{\"source\":\"wider_detection_results/wider_face_person_det_train.json\", \"map_size\": 18, \"batch_size\": 128}"
  }
  include {
    phase: TEST
  }
}

You can download the sample detection result files in json format at

wider_face_person_det_train.json

wider_face_person_det_test.json

These detection results were generated using the ACF face detector and the LDCF person detector.


Please refer to

WIDER Dataset

and

the CVPR'15 paper

for more details

A standard version of Caffe is required to run the model.

# Deploy-time definition of the two-channel fusion network. Both channels are
# fed through explicit input blobs; there are no data layers in this spec.
name: "WIDER_Fusion"
# Appearance channel input. The four input_dim values give the blob shape
# 1 x 3 x 224 x 224 (Caffe's legacy input_dim order: num, channels, height, width).
input: "image_data"
input_dim: 1
input_dim: 3
input_dim: 224
input_dim: 224
# Spatial-map channel input with shape 1 x 6 x 18 x 18. The 6 x 18 x 18 part
# matches what the Python layer below produces with map_ch = 6 and map_size = 18.
input: "map_data"
input_dim: 1
input_dim: 6
input_dim: 18
input_dim: 18
#Note: This Channel has the same structure of AlexNet
#######################################AlexNet Appearance Analysis Channel##############################################
# Five convolution stages (conv1-conv5) followed by one fully connected layer
# (fc6). All ReLU layers and the dropout layer operate in place (bottom == top).
layer { name: "conv1" type: "Convolution" bottom: "image_data" top: "conv1"
convolution_param { num_output: 96 kernel_size: 11 stride: 4}}
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1"}
layer { name: "pool1" type: "Pooling" bottom: "conv1" top: "pool1"
pooling_param { pool: MAX kernel_size: 3 stride: 2}}
layer { name: "norm1" type: "LRN" bottom: "pool1" top: "norm1"
lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
# conv2, conv4 and conv5 use group: 2 (grouped convolution).
layer { name: "conv2" type: "Convolution" bottom: "norm1" top: "conv2"
convolution_param {num_output: 256 pad: 2 kernel_size: 5 group: 2}}
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2"}
layer { name: "pool2" type: "Pooling" bottom: "conv2" top: "pool2"
pooling_param {pool: MAX kernel_size: 3 stride: 2}}
layer { name: "norm2" type: "LRN" bottom: "pool2" top: "norm2"
lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
layer { name: "conv3" type: "Convolution" bottom: "norm2" top: "conv3"
convolution_param {num_output: 384 pad: 1 kernel_size: 3}}
layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3"}
layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4"
convolution_param {num_output: 384 pad: 1 kernel_size: 3 group: 2}}
layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4"}
layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5"
convolution_param {num_output: 256 pad: 1 kernel_size: 3 group: 2}}
layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5"}
layer { name: "pool5" type: "Pooling" bottom: "conv5" top: "pool5"
pooling_param {pool: MAX kernel_size: 3 stride: 2}}
# fc6 yields the 4096-d appearance feature. Because relu6 and drop6 write back
# into "fc6", the fusion layer further down consumes the post-dropout blob.
layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6"
inner_product_param {num_output: 4096}}
layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6"}
layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6"
dropout_param {dropout_ratio: 0.5}}
####################################################### Multi-Scale Spatial Map Channel#################################
# Small convolutional stack over the "map_data" detection maps, ending in a
# 4096-d feature that is fused with the appearance feature.
layer { name: "map_conv1" type: "Convolution" bottom: "map_data" top: "map_conv1"
convolution_param {num_output: 64 kernel_size: 3 stride: 1 pad: 1}}
layer { name: "map_relu1" type: "ReLU" bottom: "map_conv1" top: "map_conv1"}
layer { name: "map_pool1" type: "Pooling" bottom: "map_conv1" top: "map_pool1"
pooling_param {pool: MAX kernel_size: 3 stride: 3}}
# NOTE(review): map_conv2 feeds map_fc1 directly; no ReLU sits between them
# (layer names jump from map_relu1 to map_relu3). Presumably this mirrors the
# trained model -- confirm before adding a non-linearity here.
layer { name: "map_conv2" type: "Convolution" bottom: "map_pool1" top: "map_conv2"
convolution_param {num_output: 128 kernel_size: 1 stride: 1}}
layer { name: "map_fc1" type: "InnerProduct" bottom: "map_conv2" top: "map_fc1"
inner_product_param {num_output: 4096}}
layer { name: "map_relu3" type: "ReLU" bottom: "map_fc1" top: "map_fc1"}
# Unlike drop6, this dropout is not in place: it writes to a new blob "map_drop1".
layer { name: "map_drop1" type: "Dropout" bottom: "map_fc1" top: "map_drop1"
dropout_param {dropout_ratio: 0.5}}
# Channel fusion: element-wise SUM of the two 4096-d features (map_drop1 and fc6).
layer { name: "add" type: "Eltwise" bottom: "map_drop1" bottom: "fc6" top: "fusion_fc6"
eltwise_param {operation: SUM}}
############################################## Classification Branch ###################################################
# Classifier on top of the fused feature "fusion_fc6".
layer { name: "fc7" type: "InnerProduct" bottom: "fusion_fc6" top: "fc7"
inner_product_param {num_output: 4096}}
layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7"}
layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7"
dropout_param {dropout_ratio: 0.5}}
# The layer is named "fc8_event" but its output blob is "fc8"; it produces 61
# scores (presumably one per event class -- confirm against the dataset).
layer { name: "fc8_event" type: "InnerProduct" bottom: "fc7" top: "fc8"
inner_product_param {num_output: 61}}
# Final softmax over the fc8 scores; this layer carries no name, and its
# output blob is "prob".
layer { type: "Softmax" bottom: "fc8" top: "prob"}
import caffe
import numpy as np
import math
# The layer configuration (param_str) and the detection files are JSON.
# Prefer simplejson when it is installed, otherwise fall back to the
# standard-library json module.
try:
    import simplejson as json
except ImportError:
    import json

# Scale-bin thresholds. A box's area, as a fraction of the image area, is
# compared against these ticks to choose its scale bin.
face_scale_thresh = np.array([0.1, 0.2,])
person_scale_thresh = np.array([0.2, 0.4,])

# Baseline value every map cell is set to before detections are accumulated.
offset = 0.01
class MultiScaleSpatialMapLayer(caffe.Layer):
    """Caffe Python data layer that rasterises detection boxes into maps.

    The layer takes no bottom blobs.  Its ``param_str`` is a JSON object with
    the keys ``source`` (path to a detection-result JSON file), ``map_size``
    (side length of the square output map) and ``batch_size``.

    Tops:
        top[0]: ``batch_size x 6 x map_size x map_size`` float map.  Channels
            0-2 are reserved for face detections and channels 3-5 for person
            detections.
        top[1] (optional): ``batch_size x 1`` blob holding each record's
            ``gt`` value.

    Each record of the detection file is expected to provide ``gt``, ``face``
    and ``person``; ``face``/``person`` are either ``None`` or a dict with
    ``size`` (indexed as ``[width, height]``) and ``bbox`` (a list of
    ``[x0, y0, x1, y1]`` boxes).
    """

    def setup(self, bottom, top):
        """Parse ``param_str``, validate the blob counts and load detections.

        Raises:
            Exception: if any bottom blob is connected, or if the number of
                top blobs is not 1 or 2.
        """
        if len(bottom) >= 1:
            raise Exception("This is a data layer, it takes no bottom blob")

        self.json_param = json.loads(self.param_str)
        self.data_file = self.json_param['source']
        self.map_size = self.json_param['map_size']
        self.batch_size = self.json_param['batch_size']
        self.map_ch = 6

        if len(top) == 1:
            self.give_gt = False
        elif len(top) == 2:
            self.give_gt = True
        else:
            # Also reached with zero tops, so do not claim "too many".
            raise Exception("Expect 1 or 2 top blobs, got %d" % len(top))

        # Use a context manager so the file handle is closed deterministically.
        with open(self.data_file) as det_file:
            tmp = json.load(det_file)
        self.det_data = tmp['data']
        self.data_cursor = iter(self.det_data)

    def get_next_img(self):
        """Return the next detection record, rewinding at the end of the data.

        Raises:
            ValueError: if the detection file contains no records at all.
        """
        try:
            return next(self.data_cursor)
        except StopIteration:
            # Rewind if we reached the end.
            self.data_cursor = iter(self.det_data)
        try:
            return next(self.data_cursor)
        except StopIteration:
            # Still nothing after a rewind: the data set itself is empty.
            raise ValueError(
                "No detection records found in %s" % self.data_file)

    def reshape(self, bottom, top):
        """Shape the top blobs and allocate the per-batch staging buffer."""
        top[0].reshape(self.batch_size, self.map_ch,
                       self.map_size, self.map_size)
        if self.give_gt:
            top[1].reshape(self.batch_size, 1)
        self.data_buffer = np.empty(
            (self.batch_size, self.map_ch, self.map_size, self.map_size),
            dtype=np.float32)

    @staticmethod
    def _to_cell(coord, ratio, map_size, up=False):
        """Convert a pixel coordinate to an integer map cell in [0, map_size].

        ``up`` selects ceil (used for the far edge of a box) instead of floor.
        The lower clamp keeps boxes that start outside the image from turning
        into negative slice indices, which would wrap around.
        """
        cell = math.ceil(coord / ratio) if up else math.floor(coord / ratio)
        return int(max(0, min(cell, map_size)))

    @staticmethod
    def _scale_channel(bbox, img_size, scale_ticks):
        """Return the scale channel (relative to the detector's base) of a box.

        The box area, as a fraction of the image area, is compared against
        ``scale_ticks``; the count of ticks it reaches is then square-rooted
        and truncated.  With two ticks this maps counts 0, 1, 2 to channels
        0, 1, 1.

        NOTE(review): the sqrt + truncation reproduces what the original code
        did through legacy NumPy float indexing.  It is presumably what the
        published weights were trained with, so it is kept as-is -- confirm
        before changing the mapping.
        """
        area = (float(bbox[2] - bbox[0]) * float(bbox[3] - bbox[1])
                / img_size[0] / img_size[1])
        scale = int((scale_ticks <= area).sum())
        return int(math.sqrt(scale))

    def _accumulate_boxes(self, idx, det, scale_ticks, channel_base):
        """Add 1 to the covered cells of sample ``idx`` for every box in ``det``.

        ``det`` may be ``None`` (no detections of this kind), in which case
        nothing is written.
        """
        if det is None:
            return
        img_size = det['size']
        w_ratio = img_size[0] / float(self.map_size)
        h_ratio = img_size[1] / float(self.map_size)
        for bbox in det['bbox']:
            x0 = self._to_cell(bbox[0], w_ratio, self.map_size)
            y0 = self._to_cell(bbox[1], h_ratio, self.map_size)
            x1 = self._to_cell(bbox[2], w_ratio, self.map_size, True)
            y1 = self._to_cell(bbox[3], h_ratio, self.map_size, True)
            channel = channel_base + self._scale_channel(
                bbox, img_size, scale_ticks)
            self.data_buffer[idx, channel, y0:y1, x0:x1] += 1

    def build_det_map(self, idx, img_info):
        """Rasterise one record into ``self.data_buffer[idx]``.

        Args:
            idx: position of the sample inside the current batch.
            img_info: one detection record (see the class docstring).

        Returns:
            The record's ``gt`` value.
        """
        img_label = img_info['gt']
        # Face maps start at channel 0, person maps at channel 3.
        self._accumulate_boxes(idx, img_info['face'], face_scale_thresh, 0)
        self._accumulate_boxes(idx, img_info['person'], person_scale_thresh, 3)
        return img_label

    def forward(self, bottom, top):
        """Fill the top blobs with the next ``batch_size`` records."""
        # Every cell starts from the baseline; detections are added on top.
        self.data_buffer[:] = offset
        gt_label = np.empty((self.batch_size, 1), dtype=np.float32)
        for n in range(self.batch_size):
            img_info = self.get_next_img()
            gt_label[n] = self.build_det_map(n, img_info)

        # Copy the staging buffer to the output blobs.
        top[0].data[:] = self.data_buffer[:]
        if self.give_gt:
            top[1].data[:] = gt_label[:]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment