yjxiong/1 - WIDER Deep Channel Fusing.md

## 1 - WIDER Deep Channel Fusing.md

      
    Raw
  

              1 - WIDER Deep Channel Fusing.md
            
          
    This gist holds the Caffe style model spec for the CVPR'15 paper
Recognize Complex Events from Static Images by Fusing Deep Channels
The model has two channels, one for appearance analysis, the other one for detection bounding box analysis.
The appearance analysis channel has a structure similar to AlexNet and is therefore initialized from a model pretrained on ImageNet.

There are some minor differences between this model and the one described in the paper:

Only person and face detection results are used


The weights for deploy model can be downloaded at
cuhk_wider_deep_channel_fusing.caffemodel

We also provide a simple "Python Layer" to generate the multi-scale spatial map on the fly. It can be used with Caffe in the following fashion
layer {
  type: "Python"
  name: "map_data"
  top: "map_data"
  python_param {
    module: "mss_map_layer"
    layer: "MultiScaleSpatialMapLayer"
    param_str: "{\"source\":\"wider_detection_results/wider_face_person_det_train.json\", \"map_size\": 18, \"batch_size\": 128}"
  }
  include {
    phase: TEST
  }
}


You can download the sample detection result files in json format at
wider_face_person_det_train.json
wider_face_person_det_test.json
These detection results were generated using the ACF face detector and the LDCF person detector.

Please refer to
WIDER Dataset
and
the CVPR'15 paper
for more details
A standard version of Caffe is required to run the model.

  
## cuhk_wider_deep_channel_fusing_deploy.prototxt
# Deploy-time network definition. It declares two input blobs and no data layers.
name: "WIDER_Fusion"
# Input 1 ("image_data"): declared with dims 1 x 3 x 224 x 224.
input: "image_data"
input_dim: 1
input_dim: 3
input_dim: 224
input_dim: 224
# Input 2 ("map_data"): declared with dims 1 x 6 x 18 x 18.
# The 18 matches the "map_size": 18 used in the README's Python layer example.
input: "map_data"
input_dim: 1
input_dim: 6
input_dim: 18
input_dim: 18
# Note: this channel has the same structure as AlexNet.
# It consumes "image_data" and ends at the 4096-output blob "fc6".
# ReLU and Dropout layers here reuse their bottom blob name as top (in-place).
#######################################AlexNet Appearance Analysis Channel##############################################
# Stage 1: 11x11 stride-4 convolution -> ReLU -> 3x3 stride-2 max pooling -> LRN.
layer { name: "conv1" type: "Convolution" bottom: "image_data" top: "conv1"
  convolution_param { num_output: 96 kernel_size: 11 stride: 4}}
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1"}
layer { name: "pool1" type: "Pooling" bottom: "conv1" top: "pool1"
  pooling_param { pool: MAX kernel_size: 3 stride: 2}}
layer { name: "norm1" type: "LRN" bottom: "pool1" top: "norm1"
  lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
# Stage 2: 5x5 grouped convolution (group: 2) -> ReLU -> 3x3 stride-2 max pooling -> LRN.
layer { name: "conv2" type: "Convolution" bottom: "norm1" top: "conv2"
  convolution_param {num_output: 256 pad: 2 kernel_size: 5 group: 2}}
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2"}
layer { name: "pool2" type: "Pooling" bottom: "conv2" top: "pool2"
  pooling_param {pool: MAX kernel_size: 3 stride: 2}}
layer { name: "norm2" type: "LRN" bottom: "pool2" top: "norm2"
  lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
# Stages 3-5: three 3x3 convolutions with pad 1, each followed by ReLU; conv4 and conv5 use group: 2.
layer { name: "conv3" type: "Convolution" bottom: "norm2" top: "conv3"
  convolution_param {num_output: 384 pad: 1 kernel_size: 3}}
layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3"}
layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4"
  convolution_param {num_output: 384 pad: 1 kernel_size: 3 group: 2}}
layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4"}
layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5"
  convolution_param {num_output: 256 pad: 1 kernel_size: 3 group: 2}}
layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5"}
layer { name: "pool5" type: "Pooling" bottom: "conv5" top: "pool5"
  pooling_param {pool: MAX kernel_size: 3 stride: 2}}
# Fully connected head of this channel: fc6 (4096 outputs) -> ReLU -> Dropout(0.5), all written to "fc6".
layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6"
 inner_product_param {num_output: 4096}}
layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6"}
layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6"
  dropout_param {dropout_ratio: 0.5}}
####################################################### Multi-Scale Spatial Map Channel#################################
# This channel consumes "map_data" and ends at the 4096-output blob "map_drop1".
# 3x3 convolution (pad 1, stride 1) -> ReLU -> 3x3 stride-3 max pooling.
layer { name: "map_conv1" type: "Convolution" bottom: "map_data" top: "map_conv1"
  convolution_param {num_output: 64 kernel_size: 3 stride: 1 pad: 1}}
layer { name: "map_relu1" type: "ReLU" bottom: "map_conv1" top: "map_conv1"}
layer { name: "map_pool1" type: "Pooling" bottom: "map_conv1" top: "map_pool1"
  pooling_param {pool: MAX kernel_size: 3 stride: 3}}
# 1x1 convolution. No ReLU layer follows it in this spec; the next non-linearity is
# "map_relu3", applied after "map_fc1".
layer { name: "map_conv2" type: "Convolution" bottom: "map_pool1" top: "map_conv2"
  convolution_param {num_output: 128 kernel_size: 1 stride: 1}}
layer { name: "map_fc1" type: "InnerProduct" bottom: "map_conv2" top: "map_fc1"
  inner_product_param {num_output: 4096}}
layer { name: "map_relu3" type: "ReLU" bottom: "map_fc1" top: "map_fc1"}
# Unlike "drop6", this Dropout writes to a separate top blob ("map_drop1").
layer { name: "map_drop1" type: "Dropout"  bottom: "map_fc1" top: "map_drop1"
  dropout_param {dropout_ratio: 0.5}}
# Fusion: element-wise SUM of the two 4096-output branches ("map_drop1" and "fc6").
layer { name: "add" type: "Eltwise" bottom: "map_drop1" bottom: "fc6" top: "fusion_fc6"
  eltwise_param {operation: SUM}}
############################################## Classification Branch ###################################################
# Operates on the fused blob "fusion_fc6": fc7 (4096 outputs) -> ReLU -> Dropout(0.5), in-place on "fc7".
layer { name: "fc7" type: "InnerProduct" bottom: "fusion_fc6" top: "fc7"
 inner_product_param {num_output: 4096}}
layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7"}
layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7"
  dropout_param {dropout_ratio: 0.5}}
# Final layer is named "fc8_event" but writes the blob "fc8"; it has 61 outputs.
layer { name: "fc8_event" type: "InnerProduct" bottom: "fc7" top: "fc8"
  inner_product_param {num_output: 61}}
# Unnamed Softmax layer turning "fc8" into the output blob "prob".
layer { type: "Softmax" bottom: "fc8" top: "prob"}

## mss_map_layer.py
import caffe
import numpy as np
import math

#Layer config is json
try:
  import simplejson as json
except ImportError:
  import json

# Thresholds on the ratio (bbox area / image area). get_scale() counts how many of
# them a face box reaches in order to choose the map channel.
face_scale_thresh = np.array([0.1, 0.2,])
# Same thresholds for person boxes; person channels are additionally shifted by 3.
person_scale_thresh = np.array([0.2, 0.4,])
# Value the whole map buffer is filled with in forward() before boxes are accumulated.
offset = 0.01

class MultiScaleSpatialMapLayer(caffe.Layer):
	"""Caffe Python data layer that rasterises detection boxes into a spatial map.

	The layer takes no bottom blobs. ``param_str`` is a JSON object with the keys
	``source`` (path of a JSON file whose ``data`` entry holds the per-image
	records), ``map_size`` (side length of the square map) and ``batch_size``.

	top[0] receives a float32 blob of shape (batch_size, 6, map_size, map_size).
	If a second top is declared, top[1] receives each record's ``gt`` value with
	shape (batch_size, 1).
	"""

	def setup(self, bottom, top):
		"""Parse ``param_str``, validate the blob counts and load the records.

		Raises:
			Exception: if a bottom blob is connected, if the number of top blobs
				is not 1 or 2, or if the source file holds no records.
		"""
		if len(bottom) >= 1:
			raise Exception("This is a data layer, it takes no bottom blob")

		self.json_param = json.loads(self.param_str)

		self.data_file = self.json_param['source']
		self.map_size = self.json_param['map_size']
		self.batch_size = self.json_param['batch_size']

		# build_det_map writes face boxes from channel 0 and person boxes from channel 3.
		self.map_ch = 6

		if len(top) == 1:
			self.give_gt = False
		elif len(top) == 2:
			self.give_gt = True
		else:
			# This branch is also reached with zero tops, so report the real count.
			raise Exception("Expect 1 or 2 top blobs, got %d" % len(top))

		# Use a context manager so the file handle is closed right after parsing.
		with open(self.data_file) as det_file:
			tmp = json.load(det_file)
		self.det_data = tmp['data']
		if not self.det_data:
			# An empty record list would otherwise surface later as a bare
			# StopIteration raised from get_next_img().
			raise Exception("No detection records found in %s" % self.data_file)
		self.data_cursor = iter(self.det_data)

	def get_next_img(self):
		"""Return the next record, restarting from the first one after the last."""
		try:
			return next(self.data_cursor)
		except StopIteration:
			# Rewind if we reached the end.
			self.data_cursor = iter(self.det_data)
			return next(self.data_cursor)

	def reshape(self, bottom, top):
		"""Shape the top blobs and (re)allocate the staging buffer for one batch."""
		top[0].reshape(self.batch_size, self.map_ch, self.map_size, self.map_size)

		if self.give_gt:
			top[1].reshape(self.batch_size, 1)

		self.data_buffer = np.empty((self.batch_size, self.map_ch, self.map_size, self.map_size),
			dtype=np.float32)

	def _norm_coord(self, c, ratio, up=False):
		"""Map pixel coordinate ``c`` to a cell index clamped to [0, map_size].

		``ratio`` is pixels per map cell; ``up`` selects ceil instead of floor.
		"""
		cell = math.ceil(c / ratio) if up else math.floor(c / ratio)
		# int(): math.floor/ceil return floats on Python 2 and current NumPy
		# rejects float slice bounds.
		# max(0, ...): a negative bound would be read by NumPy as "from the end".
		return int(max(0, min(cell, self.map_size)))

	def _scale_channel(self, bbox, img_size, scale_ticks):
		"""Return the channel offset for ``bbox`` from its relative area."""
		area = float(bbox[2] - bbox[0]) * float(bbox[3] - bbox[1]) / img_size[0] / img_size[1]
		scale = (scale_ticks <= area).sum()
		# The original returned the float math.sqrt(scale) and used it as an index.
		# Truncating makes it a valid integer index on every NumPy version.
		# NOTE(review): with two thresholds this yields only 0 or 1, so the third
		# channel of each group is never written. Presumably the released weights
		# were produced with this behaviour - confirm before changing the mapping.
		return int(math.sqrt(scale))

	def _draw_boxes(self, idx, det, scale_ticks, ch_base):
		"""Accumulate every box of one detector into sample ``idx`` of the buffer.

		``det`` is ``None`` (nothing to draw) or a mapping with ``size`` and
		``bbox``. Boxes are treated as (x1, y1, x2, y2) and ``size`` as
		(width, height), following how the indices are paired with the
		width/height ratios below.
		"""
		if det is None:
			return
		img_size = det['size']
		w_ratio = img_size[0] / float(self.map_size)
		h_ratio = img_size[1] / float(self.map_size)
		for bbox in det['bbox']:
			x0 = self._norm_coord(bbox[0], w_ratio)
			y0 = self._norm_coord(bbox[1], h_ratio)
			x1 = self._norm_coord(bbox[2], w_ratio, True)
			y1 = self._norm_coord(bbox[3], h_ratio, True)
			ch = ch_base + self._scale_channel(bbox, img_size, scale_ticks)
			self.data_buffer[idx, ch, y0:y1, x0:x1] += 1

	def build_det_map(self, idx, img_info):
		"""Fill sample ``idx`` of the buffer from one record and return its ``gt``."""
		img_label = img_info['gt']

		# Face boxes go to channels starting at 0, person boxes to channels starting at 3.
		self._draw_boxes(idx, img_info['face'], face_scale_thresh, 0)
		self._draw_boxes(idx, img_info['person'], person_scale_thresh, 3)
		return img_label

	def forward(self, bottom, top):
		"""Build one batch of maps (and labels, if requested) into the top blobs."""
		# Every cell starts at the constant offset; boxes add 1 per covering detection.
		self.data_buffer[:] = offset
		gt_label = np.empty((self.batch_size, 1), dtype=np.float32)

		# range() works on both Python 2 and 3, unlike xrange().
		for n in range(self.batch_size):
			img_info = self.get_next_img()
			gt_label[n] = self.build_det_map(n, img_info)

		# Copy buffer to output data.
		top[0].data[:] = self.data_buffer[:]
		if self.give_gt:
			top[1].data[:] = gt_label[:]
	# NOTE(review): this tab-indented span repeats the deploy network definition that
	# appears earlier in the file; it looks like an extraction duplicate - confirm.
	name: "WIDER_Fusion"
	# Input 1 ("image_data"): declared with dims 1 x 3 x 224 x 224.
	input: "image_data"
	input_dim: 1
	input_dim: 3
	input_dim: 224
	input_dim: 224
	# Input 2 ("map_data"): declared with dims 1 x 6 x 18 x 18.
	input: "map_data"
	input_dim: 1
	input_dim: 6
	input_dim: 18
	input_dim: 18
	# Note: this channel has the same structure as AlexNet.
	#######################################AlexNet Appearance Analysis Channel##############################################
	layer { name: "conv1" type: "Convolution" bottom: "image_data" top: "conv1"
	convolution_param { num_output: 96 kernel_size: 11 stride: 4}}
	layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1"}
	layer { name: "pool1" type: "Pooling" bottom: "conv1" top: "pool1"
	pooling_param { pool: MAX kernel_size: 3 stride: 2}}
	layer { name: "norm1" type: "LRN" bottom: "pool1" top: "norm1"
	lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
	layer { name: "conv2" type: "Convolution" bottom: "norm1" top: "conv2"
	convolution_param {num_output: 256 pad: 2 kernel_size: 5 group: 2}}
	layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2"}
	layer { name: "pool2" type: "Pooling" bottom: "conv2" top: "pool2"
	pooling_param {pool: MAX kernel_size: 3 stride: 2}}
	layer { name: "norm2" type: "LRN" bottom: "pool2" top: "norm2"
	lrn_param {local_size: 5 alpha: 0.0001 beta: 0.75}}
	layer { name: "conv3" type: "Convolution" bottom: "norm2" top: "conv3"
	convolution_param {num_output: 384 pad: 1 kernel_size: 3}}
	layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3"}
	layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4"
	convolution_param {num_output: 384 pad: 1 kernel_size: 3 group: 2}}
	layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4"}
	layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5"
	convolution_param {num_output: 256 pad: 1 kernel_size: 3 group: 2}}
	layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5"}
	layer { name: "pool5" type: "Pooling" bottom: "conv5" top: "pool5"
	pooling_param {pool: MAX kernel_size: 3 stride: 2}}
	# fc6 (4096 outputs) -> ReLU -> Dropout(0.5), all written in place to "fc6".
	layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6"
	inner_product_param {num_output: 4096}}
	layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6"}
	layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6"
	dropout_param {dropout_ratio: 0.5}}
	####################################################### Multi-Scale Spatial Map Channel#################################
	layer { name: "map_conv1" type: "Convolution" bottom: "map_data" top: "map_conv1"
	convolution_param {num_output: 64 kernel_size: 3 stride: 1 pad: 1}}
	layer { name: "map_relu1" type: "ReLU" bottom: "map_conv1" top: "map_conv1"}
	layer { name: "map_pool1" type: "Pooling" bottom: "map_conv1" top: "map_pool1"
	pooling_param {pool: MAX kernel_size: 3 stride: 3}}
	# 1x1 convolution; no ReLU layer follows it in this spec.
	layer { name: "map_conv2" type: "Convolution" bottom: "map_pool1" top: "map_conv2"
	convolution_param {num_output: 128 kernel_size: 1 stride: 1}}
	layer { name: "map_fc1" type: "InnerProduct" bottom: "map_conv2" top: "map_fc1"
	inner_product_param {num_output: 4096}}
	layer { name: "map_relu3" type: "ReLU" bottom: "map_fc1" top: "map_fc1"}
	layer { name: "map_drop1" type: "Dropout" bottom: "map_fc1" top: "map_drop1"
	dropout_param {dropout_ratio: 0.5}}
	# Fusion: element-wise SUM of the two 4096-output branches ("map_drop1" and "fc6").
	layer { name: "add" type: "Eltwise" bottom: "map_drop1" bottom: "fc6" top: "fusion_fc6"
	eltwise_param {operation: SUM}}
	############################################## Classification Branch ###################################################
	layer { name: "fc7" type: "InnerProduct" bottom: "fusion_fc6" top: "fc7"
	inner_product_param {num_output: 4096}}
	layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7"}
	layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7"
	dropout_param {dropout_ratio: 0.5}}
	# Layer "fc8_event" writes the blob "fc8" (61 outputs); the unnamed Softmax turns it into "prob".
	layer { name: "fc8_event" type: "InnerProduct" bottom: "fc7" top: "fc8"
	inner_product_param {num_output: 61}}
	layer { type: "Softmax" bottom: "fc8" top: "prob"}
	import caffe
	import numpy as np
	import math

	#Layer config is json
	# Prefer simplejson when it is installed; fall back to the standard library.
	try:
		import simplejson as json
	except ImportError:
		import json

	# Thresholds on the ratio (bbox area / image area) used to pick the map channel
	# for face boxes and person boxes respectively.
	face_scale_thresh = np.array([0.1, 0.2,])
	person_scale_thresh = np.array([0.2, 0.4,])
	# Value the map buffer is filled with before boxes are accumulated.
	offset = 0.01

	class MultiScaleSpatialMapLayer(caffe.Layer):
		"""Caffe Python data layer that rasterises detection boxes into a spatial map.

		The layer takes no bottom blobs. ``param_str`` is a JSON object with the keys
		``source`` (path of a JSON file whose ``data`` entry holds the per-image
		records), ``map_size`` (side length of the square map) and ``batch_size``.

		top[0] receives a float32 blob of shape (batch_size, 6, map_size, map_size).
		If a second top is declared, top[1] receives each record's ``gt`` value with
		shape (batch_size, 1).
		"""

		def setup(self, bottom, top):
			"""Parse ``param_str``, validate the blob counts and load the records.

			Raises:
				Exception: if a bottom blob is connected, if the number of top blobs
					is not 1 or 2, or if the source file holds no records.
			"""
			if len(bottom) >= 1:
				raise Exception("This is a data layer, it takes no bottom blob")

			self.json_param = json.loads(self.param_str)

			self.data_file = self.json_param['source']
			self.map_size = self.json_param['map_size']
			self.batch_size = self.json_param['batch_size']

			# build_det_map writes face boxes from channel 0 and person boxes from channel 3.
			self.map_ch = 6

			if len(top) == 1:
				self.give_gt = False
			elif len(top) == 2:
				self.give_gt = True
			else:
				# This branch is also reached with zero tops, so report the real count.
				raise Exception("Expect 1 or 2 top blobs, got %d" % len(top))

			# Use a context manager so the file handle is closed right after parsing.
			with open(self.data_file) as det_file:
				tmp = json.load(det_file)
			self.det_data = tmp['data']
			if not self.det_data:
				# An empty record list would otherwise surface later as a bare
				# StopIteration raised from get_next_img().
				raise Exception("No detection records found in %s" % self.data_file)
			self.data_cursor = iter(self.det_data)

		def get_next_img(self):
			"""Return the next record, restarting from the first one after the last."""
			try:
				return next(self.data_cursor)
			except StopIteration:
				# Rewind if we reached the end.
				self.data_cursor = iter(self.det_data)
				return next(self.data_cursor)

		def reshape(self, bottom, top):
			"""Shape the top blobs and (re)allocate the staging buffer for one batch."""
			top[0].reshape(self.batch_size, self.map_ch, self.map_size, self.map_size)

			if self.give_gt:
				top[1].reshape(self.batch_size, 1)

			self.data_buffer = np.empty((self.batch_size, self.map_ch, self.map_size, self.map_size),
				dtype=np.float32)

		def _norm_coord(self, c, ratio, up=False):
			"""Map pixel coordinate ``c`` to a cell index clamped to [0, map_size].

			``ratio`` is pixels per map cell; ``up`` selects ceil instead of floor.
			"""
			cell = math.ceil(c / ratio) if up else math.floor(c / ratio)
			# int(): math.floor/ceil return floats on Python 2 and current NumPy
			# rejects float slice bounds.
			# max(0, ...): a negative bound would be read by NumPy as "from the end".
			return int(max(0, min(cell, self.map_size)))

		def _scale_channel(self, bbox, img_size, scale_ticks):
			"""Return the channel offset for ``bbox`` from its relative area."""
			area = float(bbox[2] - bbox[0]) * float(bbox[3] - bbox[1]) / img_size[0] / img_size[1]
			scale = (scale_ticks <= area).sum()
			# The original returned the float math.sqrt(scale) and used it as an index.
			# Truncating makes it a valid integer index on every NumPy version.
			# NOTE(review): with two thresholds this yields only 0 or 1, so the third
			# channel of each group is never written. Presumably the released weights
			# were produced with this behaviour - confirm before changing the mapping.
			return int(math.sqrt(scale))

		def _draw_boxes(self, idx, det, scale_ticks, ch_base):
			"""Accumulate every box of one detector into sample ``idx`` of the buffer.

			``det`` is ``None`` (nothing to draw) or a mapping with ``size`` and
			``bbox``. Boxes are treated as (x1, y1, x2, y2) and ``size`` as
			(width, height), following how the indices are paired with the
			width/height ratios below.
			"""
			if det is None:
				return
			img_size = det['size']
			w_ratio = img_size[0] / float(self.map_size)
			h_ratio = img_size[1] / float(self.map_size)
			for bbox in det['bbox']:
				x0 = self._norm_coord(bbox[0], w_ratio)
				y0 = self._norm_coord(bbox[1], h_ratio)
				x1 = self._norm_coord(bbox[2], w_ratio, True)
				y1 = self._norm_coord(bbox[3], h_ratio, True)
				ch = ch_base + self._scale_channel(bbox, img_size, scale_ticks)
				self.data_buffer[idx, ch, y0:y1, x0:x1] += 1

		def build_det_map(self, idx, img_info):
			"""Fill sample ``idx`` of the buffer from one record and return its ``gt``."""
			img_label = img_info['gt']

			# Face boxes go to channels starting at 0, person boxes to channels starting at 3.
			self._draw_boxes(idx, img_info['face'], face_scale_thresh, 0)
			self._draw_boxes(idx, img_info['person'], person_scale_thresh, 3)
			return img_label

		def forward(self, bottom, top):
			"""Build one batch of maps (and labels, if requested) into the top blobs."""
			# Every cell starts at the constant offset; boxes add 1 per covering detection.
			self.data_buffer[:] = offset
			gt_label = np.empty((self.batch_size, 1), dtype=np.float32)

			# range() works on both Python 2 and 3, unlike xrange().
			for n in range(self.batch_size):
				img_info = self.get_next_img()
				gt_label[n] = self.build_det_map(n, img_info)

			# Copy buffer to output data.
			top[0].data[:] = self.data_buffer[:]
			if self.give_gt:
				top[1].data[:] = gt_label[:]