Skip to content

Instantly share code, notes, and snippets.

@walkoncross
Forked from vsubhashini/readme.md
Last active November 21, 2016 23:29
Show Gist options
  • Save walkoncross/689c598b83dc349dda317e2a03e960b3 to your computer and use it in GitHub Desktop.
Save walkoncross/689c598b83dc349dda317e2a03e960b3 to your computer and use it in GitHub Desktop.
Sequence to Sequence - Video to Text (S2VT)
# The network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
# K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.
# The data is prepared using framefc7_stream_text_to_hdf5.py
# It is in (32) parallel streams.
name: "s2vt"
# --- Input layers ---------------------------------------------------------
# Three HDF5Data layers share identical top blob names; Caffe's NetState
# (phase + stage) selects exactly one per run:
#   TRAIN                     -> training batches
#   TEST + "test-on-train"    -> training batches (monitors train-set fit)
#   TEST + "test-on-val"      -> validation batches
# Per the header comment, data is arranged as 32 parallel streams;
# batch_size 80 is presumably the per-step batch (80 x 32 layout implied by
# the Reshape below) -- TODO confirm against framefc7_stream_text_to_hdf5.py.
layer {
name: "data"
type: "HDF5Data"
top: "cont_sentence"
top: "input_sentence"
top: "target_sentence"
top: "stage_indicator"
top: "frame_fc7"
include { phase: TRAIN }
hdf5_data_param {
source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
batch_size: 80
}
}
# Test-time copy of the data layer that reads the TRAIN split, so the solver
# can report metrics on training data ("test-on-train" stage).
layer {
name: "data"
type: "HDF5Data"
top: "cont_sentence"
top: "input_sentence"
top: "target_sentence"
top: "stage_indicator"
top: "frame_fc7"
include {
phase: TEST
stage: "test-on-train"
}
hdf5_data_param {
source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
batch_size: 80
}
}
# Test-time copy reading the validation split ("test-on-val" stage).
layer {
name: "data"
type: "HDF5Data"
top: "cont_sentence"
top: "input_sentence"
top: "target_sentence"
top: "stage_indicator"
top: "frame_fc7"
include {
phase: TEST
stage: "test-on-val"
}
hdf5_data_param {
source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
batch_size: 80
}
}
# --- Feature preprocessing and embeddings ---------------------------------
# Add a trailing singleton axis to the encode/decode stage indicator
# (80 x 32 -> 80 x 32 x 1) so it can be concatenated with the 3-axis
# embedded frame/word blobs along axis 2.
layer {
name: "reshape_stg_indicator" # from 80 32 to 80 32 1 to concat
type: "Reshape"
bottom: "stage_indicator"
top: "stage_indicator_3axis"
reshape_param {
shape {
dim: 80
dim: 32
dim: 1
}
}
}
# Optional in-place dropout on the raw fc7 frame features; only active when
# the net is instantiated with stage "dropFc7" (see the solver's *_state).
layer {
name: "dropFc7"
type: "Dropout"
bottom: "frame_fc7"
top: "frame_fc7"
dropout_param { dropout_ratio: 0.5 }
include { stage: "dropFc7" }
}
# Linear projection of the fc7 frame features down to the 500-d embedding
# shared with the word embedding. axis: -1 applies the product over the
# last (feature) axis, keeping the time/stream axes intact.
layer {
name: "embed_encoder"
type: "InnerProduct"
bottom: "frame_fc7"
top: "embedded_input_frames"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
axis: -1
}
}
# Word-index -> 500-d vector lookup table for the input sentence.
# input_dim 46168 = vocabulary size + 1 (per the inline comment; the +1 is
# presumably an unknown/pad token -- confirm against the vocab file).
layer {
name: "embedding"
type: "Embed"
bottom: "input_sentence"
top: "embedded_input_sentence"
param {
lr_mult: 1
}
embed_param {
bias_term: false
input_dim: 46168 #youtube_movie_vocab+1
num_output: 500
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
}
}
# Optional in-place dropout on the embedded words (stage "dropEn").
layer {
name: "drop_input_en"
type: "Dropout"
bottom: "embedded_input_sentence"
top: "embedded_input_sentence"
dropout_param { dropout_ratio: 0.5 }
include { stage: "dropEn" }
}
# --- Unfactored variant (stage "unfactored") ------------------------------
# Frames, words, and the stage indicator are concatenated into a single
# input sequence, which is fed to one LSTM (and optionally a second stacked
# LSTM when stage "2-layer" is also set).
# unfactored model concat frames and sents
layer {
name: "concat"
type: "Concat"
concat_param { concat_dim: 2 } # concat along h
bottom: "embedded_input_frames"
bottom: "embedded_input_sentence"
bottom: "stage_indicator_3axis"
top: "embedded_input_video_sequence"
include { stage: "unfactored" }
}
# First (or only) recurrent layer over the concatenated sequence.
# cont_sentence marks sequence continuation so LSTM state is reset at
# sequence boundaries.
layer {
name: "lstm1"
type: "LSTM"
bottom: "embedded_input_video_sequence"
bottom: "cont_sentence"
top: "lstm1"
include { stage: "unfactored" }
recurrent_param {
num_output: 1000
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
}
}
# Optional second stacked LSTM (stages "unfactored" AND "2-layer").
layer {
name: "lstm2"
type: "LSTM"
bottom: "lstm1"
bottom: "cont_sentence"
top: "lstm2"
include {
stage: "unfactored"
stage: "2-layer"
}
recurrent_param {
num_output: 1000
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
}
}
# --- Factored variant (stage "factored") ----------------------------------
# lstm1 encodes only the frame embeddings; its output is then concatenated
# with the word embeddings and stage indicator and decoded by lstm2. This
# is the S2VT architecture variant used by the accompanying solver.
layer {
name: "lstm1"
type: "LSTM"
bottom: "embedded_input_frames"
bottom: "cont_sentence"
top: "lstm1"
include { stage: "factored" }
recurrent_param {
num_output: 1000
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
}
}
# Optional in-place dropout on lstm1's output (stage "dropLstm1").
layer {
name: "drop_lstm1"
type: "Dropout"
bottom: "lstm1"
top: "lstm1"
dropout_param { dropout_ratio: 0.5 }
include { stage: "dropLstm1" }
}
# Combine the frame-encoding LSTM output with the word embeddings and the
# encode/decode stage indicator along the feature axis (axis 2).
layer {
name: "concat"
type: "Concat"
concat_param { concat_dim: 2 } # concat along h
bottom: "lstm1"
bottom: "embedded_input_sentence"
bottom: "stage_indicator_3axis"
top: "lstm1_video_sequence"
include { stage: "factored" }
}
# Second LSTM: decodes the combined sequence into the sentence representation.
layer {
name: "lstm2"
type: "LSTM"
bottom: "lstm1_video_sequence"
bottom: "cont_sentence"
top: "lstm2"
include { stage: "factored" }
recurrent_param {
num_output: 1000
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
}
}
# Optional in-place dropout on lstm2's output (stage "dropLstm2").
layer {
name: "drop_lstm2"
type: "Dropout"
bottom: "lstm2"
top: "lstm2"
dropout_param { dropout_ratio: 0.5 }
include { stage: "dropLstm2" }
}
# --- Prediction heads and objectives --------------------------------------
# Two mutually exclusive "predict" heads project to vocabulary logits
# (46168 = vocab + 1): the first reads lstm1 and is EXCLUDED when stage
# "2-layer" is set; the second reads lstm2 and is INCLUDED only for
# "2-layer". Exactly one is active in any instantiation.
layer {
name: "predict"
type: "InnerProduct"
bottom: "lstm1"
top: "predict"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
exclude { stage: "2-layer" }
inner_product_param {
num_output: 46168 # youtube_movie_vocab + 1
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
axis: 2
}
}
# 2-layer head: same projection, applied to lstm2's output.
layer {
name: "predict"
type: "InnerProduct"
bottom: "lstm2"
top: "predict"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
include { stage: "2-layer" }
inner_product_param {
num_output: 46168 # youtube_movie_vocab + 1
weight_filler {
type: "uniform"
min: -0.08
max: 0.08
}
bias_filler {
type: "constant"
value: 0
}
axis: 2
}
}
# Per-timestep softmax cross-entropy over the vocabulary (axis 2).
# ignore_label -1 skips padded/encoding timesteps; loss_weight 20 scales
# the objective (presumably to offset per-timestep averaging -- confirm).
layer {
name: "cross_entropy_loss"
type: "SoftmaxWithLoss"
bottom: "predict"
bottom: "target_sentence"
top: "cross_entropy_loss"
loss_weight: 20
loss_param {
ignore_label: -1
}
softmax_param {
axis: 2
}
}
# Per-word top-1 accuracy, reported only at TEST time; padded targets (-1)
# are ignored, matching the loss.
layer {
name: "accuracy"
type: "Accuracy"
bottom: "predict"
bottom: "target_sentence"
top: "accuracy"
include { phase: TEST }
accuracy_param {
axis: 2
ignore_label: -1
}
}
# --- Solver definition -----------------------------------------------------
# NOTE(review): everything from here down is a separate file (the solver
# prototxt) that the gist concatenates after the net definition; it is not
# part of s2vt.prototxt itself.
net: "./s2vt.prototxt"
# s2vt.prototxt supports multiple sequence to sequence architectures:
# (1) stage: 'factored' stage: '2-layer'
# (2) stage: 'unfactored' stage: '1-layer'
# (3) stage: 'unfactored' stage: '2-layer'
# Addons:
# (a) stage: 'dropFc7' [input frame feature dropout]
# (b) stage: 'dropEn' [text feature dropout after embedding]
# (c) stage: 'dropLstm1' [dropout on output of lstm1]
# (d) stage: 'dropLstm2' [dropout on output of lstm2]
#
# This solver uses variant (1) which performed best on the youtube dataset.
#
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate:
train_state: { stage: 'factored' stage: '2-layer' }
# Two test nets are instantiated (test-on-train and test-on-val), each
# evaluated for 25 iterations every test_interval.
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
test_interval: 1000
base_lr: 0.01
# NOTE(review): with lr_policy "step", stepsize 20000 exceeds max_iter 18000,
# so the learning rate is never actually decayed during this run -- confirm
# this is intentional.
lr_policy: "step"
gamma: 0.5
stepsize: 20000
display: 1
max_iter: 18000
momentum: 0.9
weight_decay: 0.0000
snapshot: 1000
snapshot_prefix: "./snapshots/s2vt_youtube_vgg"
solver_mode: GPU
random_seed: 1701
# Loss displayed as a running average over the last 100 iterations.
average_loss: 100
# Gradients with L2 norm above 10 are rescaled (important for RNN stability).
clip_gradients: 10
## Sequence to Sequence -- Video to Text
Paper : [ICCV 2015 PDF](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)
Download Model: [S2VT_VGG_RGB_MODEL](https://www.dropbox.com/s/wn6k2oqurxzt6e2/s2s_vgg_pstream_allvocab_fac2_iter_16000.caffemodel?dl=1) (333MB)
[Project Page](https://vsubhashini.github.io/s2vt.html)
### Description
This is the S2VT (RGB) model described in the ICCV 2015 paper "Sequence to Sequence -- Video to Text". It uses video frame features from the
[VGG-16](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md)
layer model. This is trained only on the Youtube video dataset.
Sequence to Sequence - Video to Text
S. Venugopalan, M. Rohrbach, J. Donahue, T. Darrell, R. Mooney, K. Saenko
The IEEE International Conference on Computer Vision (ICCV) 2015
Please consider citing the above paper if you use this model.
### Performance
The METEOR score of this model is 29.2% on the Youtube (MSVD) video test dataset.
(refer to Table 2 in the [Sequence to Sequence - Video to Text
paper](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)).
### Caffe compatibility
The models are currently supported by the `recurrent` branch of the Caffe fork
by [Jeff Donahue](https://github.com/jeffdonahue/caffe.git) and
[Subhashini Venugopalan](https://github.com/vsubhashini/caffe.git), but are not yet
compatible with `master` branch of [Caffe](https://github.com/BVLC/caffe/).
### Training
More details on the code and data can be found on this [Project
Page](https://vsubhashini.github.io/s2vt.html).
The prototxts for the network and solver can also be found here:
https://github.com/vsubhashini/caffe/tree/recurrent/examples/s2vt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment