Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sequence to Sequence - Video to Text (S2VT)

## Sequence to Sequence -- Video to Text

Paper : ICCV 2015 PDF

Download Model: S2VT_VGG_RGB_MODEL (333MB)

Project Page

Description

This is the S2VT (RGB) model described in the ICCV 2015 paper "Sequence to Sequence -- Video to Text". It uses video frame features from the 16-layer VGG (VGG-16) model and was trained only on the Youtube (MSVD) video dataset.

Sequence to Sequence - Video to Text
S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell, K. Saenko
The IEEE International Conference on Computer Vision (ICCV) 2015

Please consider citing the above paper if you use this model.

Performance

The METEOR score of this model is 29.2% on the Youtube (MSVD) video test dataset (refer to Table 2 in the Sequence to Sequence - Video to Text paper).

Caffe compatibility

The models are currently supported by the recurrent branch of the Caffe fork by Jeff Donahue and Subhashini Venugopalan, but are not yet compatible with the master branch of Caffe.

Training

More details on the code and data can be found on this Project Page.

The prototxts for the network and solver can also be found here: https://github.com/vsubhashini/caffe/tree/recurrent/examples/s2vt

# Network definition used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
#     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.
#
# The HDF5 input data is prepared using framefc7_stream_text_to_hdf5.py
# and is laid out in 32 parallel streams.
name: "s2vt"
# Training input. Reads the five S2VT blobs from the HDF5 chunks named in the
# train_batches chunk list; only instantiated in the TRAIN phase.
layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TRAIN }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"    # second bottom of every LSTM layer
  top: "input_sentence"   # bottom of the "embedding" (Embed) layer
  top: "target_sentence"  # label blob for the loss and accuracy layers
  top: "stage_indicator"  # reshaped, then concatenated into the LSTM input
  top: "frame_fc7"        # bottom of the "embed_encoder" layer
}
# Test-time input for the "test-on-train" stage. Produces the same five blobs
# as the TRAIN data layer and reads the same train_batches chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-train" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
# Test-time input for the "test-on-val" stage. Produces the same five blobs
# as the other data layers but reads the valid_batches chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-val" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
# Appends a trailing singleton axis to stage_indicator so that it can be
# concatenated along axis 2 by the "concat" layers.
# "dim: 0" copies the matching dimension from the bottom blob, so the layer
# follows whatever batch / stream sizes the data layer delivers (80 x 32 for
# the HDF5 batches referenced in this file) instead of hard-coding them.
layer {
  name: "reshape_stg_indicator"
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape {
      dim: 0  # copy axis 0 of the bottom
      dim: 0  # copy axis 1 of the bottom
      dim: 1  # new trailing axis
    }
  }
}
# Optional add-on: in-place dropout on the input frame features.
# Only present when the net state contains the "dropFc7" stage.
layer {
  name: "dropFc7"
  type: "Dropout"
  include { stage: "dropFc7" }
  bottom: "frame_fc7"
  top: "frame_fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# Fully connected embedding of the frame features: maps frame_fc7 to
# 500 outputs, taking the inner product over the last axis (axis: -1).
layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  # Weights: default learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Bias: doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: -1
    num_output: 500
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0.2 }
  }
}
# Word embedding: Embed layer over input_sentence producing 500-dimensional
# vectors, without a bias term.
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param { lr_mult: 1 }
  embed_param {
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    bias_term: false
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
  }
}
# Optional add-on: in-place dropout on the embedded sentence input.
# Only present when the net state contains the "dropEn" stage.
layer {
  name: "drop_input_en"
  type: "Dropout"
  include { stage: "dropEn" }
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# Unfactored model: concatenate the embedded frames, the embedded sentence and
# the stage indicator along axis 2, forming the single input fed to lstm1.
# Uses "axis" rather than the deprecated "concat_dim" alias of ConcatParameter.
# Only present when the net state contains the "unfactored" stage.
layer {
  name: "concat"
  type: "Concat"
  include { stage: "unfactored" }
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  concat_param { axis: 2 }  # concat along h
}
# Unfactored model, first LSTM (1000 outputs). Its input is the concatenated
# frame / sentence / stage-indicator blob; cont_sentence is the second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include { stage: "unfactored" }
  bottom: "embedded_input_video_sequence"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Unfactored model, second LSTM (1000 outputs) stacked directly on lstm1.
# The include rule lists both "unfactored" and "2-layer".
layer {
  name: "lstm2"
  type: "LSTM"
  include { stage: "unfactored" stage: "2-layer" }
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Factored model, first LSTM (1000 outputs). It takes only the embedded
# frames as input; cont_sentence is the second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include { stage: "factored" }
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Optional add-on: in-place dropout on the output of lstm1.
# Only present when the net state contains the "dropLstm1" stage.
layer {
  name: "drop_lstm1"
  type: "Dropout"
  include { stage: "dropLstm1" }
  bottom: "lstm1"
  top: "lstm1"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# Factored model: concatenate the lstm1 output, the embedded sentence and the
# stage indicator along axis 2, forming the input fed to lstm2.
# Uses "axis" rather than the deprecated "concat_dim" alias of ConcatParameter.
# Only present when the net state contains the "factored" stage.
layer {
  name: "concat"
  type: "Concat"
  include { stage: "factored" }
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  concat_param { axis: 2 }  # concat along h
}
# Factored model, second LSTM (1000 outputs). Its input is the concatenation
# of the lstm1 output, the embedded sentence and the stage indicator.
# The include rule lists only "factored".
layer {
  name: "lstm2"
  type: "LSTM"
  include { stage: "factored" }
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Optional add-on: in-place dropout on the output of lstm2.
# Only present when the net state contains the "dropLstm2" stage.
layer {
  name: "drop_lstm2"
  type: "Dropout"
  include { stage: "dropLstm2" }
  bottom: "lstm2"
  top: "lstm2"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# Word prediction for the single-LSTM variants: projects the lstm1 output
# onto 46168 outputs, taking the inner product from axis 2 onward.
# Excluded whenever the net state contains the "2-layer" stage.
layer {
  name: "predict"
  type: "InnerProduct"
  exclude { stage: "2-layer" }
  bottom: "lstm1"
  top: "predict"
  # Weights: default learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Bias: doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Word prediction for the two-LSTM variants: projects the lstm2 output
# onto 46168 outputs, taking the inner product from axis 2 onward.
# Only present when the net state contains the "2-layer" stage.
layer {
  name: "predict"
  type: "InnerProduct"
  include { stage: "2-layer" }
  bottom: "lstm2"
  top: "predict"
  # Weights: default learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Bias: doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Training objective: softmax cross-entropy between the predictions and
# target_sentence, with the softmax taken over axis 2. Targets equal to -1
# are ignored, and the loss is weighted by 20.
layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  softmax_param { axis: 2 }
  loss_param { ignore_label: -1 }
}
# Prediction accuracy against target_sentence, computed over axis 2 and
# skipping targets equal to -1. Only instantiated in the TEST phase.
layer {
  name: "accuracy"
  type: "Accuracy"
  include { phase: TEST }
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  accuracy_param {
    ignore_label: -1
    axis: 2
  }
}
# Solver configuration for training the S2VT network.
net: "./s2vt.prototxt"

# s2vt.prototxt supports multiple sequence to sequence architectures:
#   (1) stage: 'factored'   stage: '2-layer'
#   (2) stage: 'unfactored' stage: '1-layer'
#   (3) stage: 'unfactored' stage: '2-layer'
# Optional add-ons:
#   (a) stage: 'dropFc7'   [input frame feature dropout]
#   (b) stage: 'dropEn'    [text feature dropout after embedding]
#   (c) stage: 'dropLstm1' [dropout on output of lstm1]
#   (d) stage: 'dropLstm2' [dropout on output of lstm2]
#
# NOTE(review): no include/exclude rule in the network layers names '1-layer';
# the single-layer "predict" layer is selected by the absence of '2-layer'.
#
# This solver uses variant (1), which performed best on the youtube dataset.
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate.

# --- Net states ---
train_state {
  stage: 'factored'
  stage: '2-layer'
}
# Test net #0: evaluate on the training batches.
test_state {
  stage: 'factored'
  stage: '2-layer'
  stage: 'test-on-train'
}
test_iter: 25
# Test net #1: evaluate on the validation batches.
test_state {
  stage: 'factored'
  stage: '2-layer'
  stage: 'test-on-val'
}
test_iter: 25
test_interval: 1000

# --- Optimisation ---
base_lr: 0.01
momentum: 0.9
weight_decay: 0.0000
clip_gradients: 10

# --- Learning-rate schedule ---
# NOTE(review): stepsize (20000) is larger than max_iter (18000), so the
# "step" policy never multiplies the rate by gamma during this run; the
# learning rate stays at base_lr. Confirm this is intended.
lr_policy: "step"
gamma: 0.5
stepsize: 20000
max_iter: 18000

# --- Logging and snapshots ---
display: 1
average_loss: 100
snapshot: 1000
snapshot_prefix: "./snapshots/s2vt_youtube_vgg"

# --- Execution ---
solver_mode: GPU
random_seed: 1701
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment