Skip to content

Instantly share code, notes, and snippets.

@yardstick17
Last active November 25, 2023 23:46
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save yardstick17/22c02363c5e04763373b588f1a3bceeb to your computer and use it in GitHub Desktop.
Save yardstick17/22c02363c5e04763373b588f1a3bceeb to your computer and use it in GitHub Desktop.
Spatial pyramid pooling (SPP) is a pooling strategy that produces an output of fixed size: it turns a 2D input of arbitrary size into an output of fixed dimension. Hence, the convolutional part of a DNN can be connected to a dense part with a fixed number of nodes even if the dimensions of the input image are unknown.
# Number of output categories; Spp() uses it as the width of its final Dense layer,
# which is followed by a softmax activation.
CUSTOM_OUTPUT_CATEGORIES = 2
import keras.backend as K
from keras.engine.topology import Layer
from keras.layers import Activation, Convolution2D, Dense, Dropout, MaxPooling2D
from keras.models import Sequential
class SpatialPyramidPooling(Layer):
    """Spatial pyramid pooling layer for 2D inputs.

    See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual
    Recognition, K. He, X. Zhang, S. Ren, J. Sun

    Each entry `n` of `pool_list` splits the feature map into an `n x n` grid
    of bins and max-pools every bin, so the output length depends only on the
    number of channels and on `pool_list`, never on the spatial input size.

    # Arguments
        pool_list: list of int
            List of pooling regions to use. The length of the list is the
            number of pooling regions, each int in the list is the number of
            regions in that pool. For example [1,2,4] would be 3 regions with
            1, 2x2 and 4x4 max pools, so 21 outputs per feature map

    # Input shape
        4D tensor with shape:
        `(samples, channels, rows, cols)` if dim_ordering='th'
        or 4D tensor with shape:
        `(samples, rows, cols, channels)` if dim_ordering='tf'.

    # Output shape
        2D tensor with shape:
        `(samples, channels * sum([i * i for i in pool_list]))`

    # Notes
        Bin edges are rounded to the nearest pixel, so a feature map that is
        much smaller than the largest grid in `pool_list` can produce empty
        bins.
    """

    def __init__(self, pool_list, **kwargs):
        self.dim_ordering = self._backend_dim_ordering()
        assert self.dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}'
        self.pool_list = pool_list
        self.num_outputs_per_channel = sum(i * i for i in pool_list)
        super(SpatialPyramidPooling, self).__init__(**kwargs)

    @staticmethod
    def _backend_dim_ordering():
        """Return the backend image ordering as 'th' or 'tf'.

        Uses `K.image_dim_ordering()` when the backend provides it and
        otherwise translates `K.image_data_format()`, so the layer keeps
        working on Keras versions that no longer ship the older function.
        """
        if hasattr(K, 'image_dim_ordering'):
            return K.image_dim_ordering()
        return 'th' if K.image_data_format() == 'channels_first' else 'tf'

    def build(self, input_shape):
        """Record the number of input channels from the static input shape."""
        channel_axis = 1 if self.dim_ordering == 'th' else 3
        self.nb_channels = input_shape[channel_axis]
        super(SpatialPyramidPooling, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        """Return the static output shape `(samples, channels * bins)`.

        This is the shape-inference hook used by Keras 2. Without it the
        base class reports the (4D) input shape as the output shape, and the
        layers stacked on top are then built for the wrong number of inputs.
        """
        return (input_shape[0], self.nb_channels * self.num_outputs_per_channel)

    def get_output_shape_for(self, input_shape):
        """Keras 1 name of the shape-inference hook; same result as above."""
        return self.compute_output_shape(input_shape)

    def get_config(self):
        """Return the layer config, including `pool_list`, for serialization."""
        config = super(SpatialPyramidPooling, self).get_config()
        config['pool_list'] = self.pool_list
        return config

    def call(self, x, mask=None):
        """Max-pool `x` over every pyramid bin and concatenate the results.

        # Arguments
            x: 4D input tensor laid out according to `self.dim_ordering`.
            mask: unused.

        # Returns
            2D tensor of shape `(samples, channels * num_outputs_per_channel)`.
        """
        channels_first = self.dim_ordering == 'th'
        row_axis, col_axis = (2, 3) if channels_first else (1, 2)

        # Dynamic shape: rows/cols are only known when the graph is executed.
        input_shape = K.shape(x)
        num_rows = K.cast(input_shape[row_axis], 'float32')
        num_cols = K.cast(input_shape[col_axis], 'float32')

        outputs = []
        for num_pool_regions in self.pool_list:
            # Fractional bin size for this pyramid level; edges are rounded below.
            row_length = num_rows / num_pool_regions
            col_length = num_cols / num_pool_regions
            for jy in range(num_pool_regions):
                for ix in range(num_pool_regions):
                    x1 = K.cast(K.round(ix * col_length), 'int32')
                    x2 = K.cast(K.round(ix * col_length + col_length), 'int32')
                    y1 = K.cast(K.round(jy * row_length), 'int32')
                    y2 = K.cast(K.round(jy * row_length + row_length), 'int32')

                    if channels_first:
                        new_shape = [input_shape[0], input_shape[1],
                                     y2 - y1, x2 - x1]
                        x_crop = x[:, :, y1:y2, x1:x2]
                    else:
                        new_shape = [input_shape[0], y2 - y1,
                                     x2 - x1, input_shape[3]]
                        x_crop = x[:, y1:y2, x1:x2, :]

                    xm = K.reshape(x_crop, new_shape)
                    # Reduce both spatial axes -> one (samples, channels) slab per bin.
                    outputs.append(K.max(xm, axis=(row_axis, col_axis)))

        # Join the per-bin slabs along the last axis.
        return K.concatenate(outputs)
def Spp(num_classes=None):
    """Build a small CNN classifier that accepts images of any size.

    Four convolution / max-pooling stages feed a `SpatialPyramidPooling`
    layer with levels [1, 2, 4], followed by two 4096-unit dense layers with
    dropout and a softmax output layer.

    # Arguments
        num_classes: int or None. Width of the final dense layer. When None
            (the default), the module-level `CUSTOM_OUTPUT_CATEGORIES` is used.

    # Returns
        An uncompiled `Sequential` model taking 3-channel images.
    """
    if num_classes is None:
        num_classes = CUSTOM_OUTPUT_CATEGORIES

    spp_layer = SpatialPyramidPooling([1, 2, 4])

    # Leave the image size as None to allow multiple image sizes. Only the
    # channel axis is fixed, and its position has to follow the ordering the
    # backend is configured for, otherwise the first convolution sees an
    # undefined channel dimension.
    if spp_layer.dim_ordering == 'th':
        input_shape = (3, None, None)
    else:
        input_shape = (None, None, 3)

    model = Sequential()
    model.add(Convolution2D(96, 11, 11, border_mode='same', input_shape=input_shape,
                            activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(32, 3, 3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(spp_layer)
    model.add(Dense(4096, activation='relu', name='dense_1'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu', name='dense_2'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, name='dense_3'))
    model.add(Activation('softmax'))
    return model
@hcl14
Copy link

hcl14 commented Sep 5, 2019

With output_categories = 1 and input_shape=(None, None, 3), it builds the following model. The SPP layer's output shape is reported as 4D rather than 2D, so the logits shape is (?,?,?,1):

>>> model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
conv2d_1 (Conv2D)            (None, None, None, 96)    11712
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, None, None, 96)    0
_________________________________________________________________
conv2d_2 (Conv2D)            (None, None, None, 32)    27680
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, None, None, 32)    0
_________________________________________________________________
conv2d_3 (Conv2D)            (None, None, None, 64)    18496
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, None, None, 64)    0
_________________________________________________________________
conv2d_4 (Conv2D)            (None, None, None, 64)    36928
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, None, None, 64)    0
_________________________________________________________________
spatial_pyramid_pooling_1 (S (None, None, None, 64)    0
_________________________________________________________________
dense_1 (Dense)              (None, None, None, 4096)  266240
_________________________________________________________________
dropout_1 (Dropout)          (None, None, None, 4096)  0
_________________________________________________________________
dense_2 (Dense)              (None, None, None, 4096)  16781312
_________________________________________________________________
dropout_2 (Dropout)          (None, None, None, 4096)  0
_________________________________________________________________
dense_3 (Dense)              (None, None, None, 1)     4097
_________________________________________________________________
activation_1 (Activation)    (None, None, None, 1)     0
=================================================================
Total params: 17,146,465
Trainable params: 17,146,465
Non-trainable params: 0
_________________________________________________________________

ValueError: logits and labels must have the same shape ((?, 1) vs (?, ?, ?, ?))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment