-
-
Save walkoncross/689c598b83dc349dda317e2a03e960b3 to your computer and use it in GitHub Desktop.
Sequence to Sequence - Video to Text (S2VT)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
#     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.
# The data is prepared using framefc7_stream_text_to_hdf5.py
# and is laid out as 32 parallel streams.
name: "s2vt"

# TRAIN-phase data source.
# Reads the HDF5 chunks listed in the train_batches chunk list and exposes the
# five tops consumed by the rest of the net.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
    # Keep in sync with the first dim (80) hard-coded in reshape_stg_indicator.
    batch_size: 80
  }
}
# TEST-phase data source for the "test-on-train" stage.
# Points at the same train_batches chunk list as the TRAIN data layer, so
# metrics reported for this stage are computed on training data.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
    # Keep in sync with the first dim (80) hard-coded in reshape_stg_indicator.
    batch_size: 80
  }
}
# TEST-phase data source for the "test-on-val" stage.
# Reads the valid_batches chunk list and exposes the same five tops as the
# other data layers.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
    # Keep in sync with the first dim (80) hard-coded in reshape_stg_indicator.
    batch_size: 80
  }
}
# Adds a trailing singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1)
# so that it can be concatenated along axis 2 in the "concat" layers.
# The dims are hard-coded: 80 matches batch_size in the data layers and 32
# matches the number of parallel streams the data was prepared with; change
# them together if either of those changes.
layer {
  name: "reshape_stg_indicator"  # from 80 32 to 80 32 1 to concat
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape {
      dim: 80
      dim: 32
      dim: 1
    }
  }
}
# Optional in-place dropout (ratio 0.5) on the input frame features.
# Only instantiated when the net state contains stage "dropFc7".
layer {
  name: "dropFc7"
  type: "Dropout"
  bottom: "frame_fc7"
  top: "frame_fc7"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropFc7" }
}
# Projects frame_fc7 to a 500-dimensional embedding ("embedded_input_frames").
# axis: -1 applies the inner product along the last axis of the bottom blob,
# leaving the leading axes intact.
layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  # First param block: weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: bias (double learning rate, no weight decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 500
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
    axis: -1
  }
}
# Word embedding: maps each index in input_sentence to a 500-dimensional
# vector ("embedded_input_sentence"). No bias term; weights are initialised
# uniformly in [-0.08, 0.08].
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param {
    lr_mult: 1
  }
  embed_param {
    bias_term: false
    # Must equal num_output of the "predict" layers (same vocabulary size).
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
  }
}
# Optional in-place dropout (ratio 0.5) on the embedded text input.
# Only instantiated when the net state contains stage "dropEn".
layer {
  name: "drop_input_en"
  type: "Dropout"
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropEn" }
}
# Unfactored model: concatenate the embedded frames, the embedded sentence and
# the stage indicator along axis 2 into a single input for lstm1.
# Only instantiated when the net state contains stage "unfactored".
layer {
  name: "concat"
  type: "Concat"
  # "axis" replaces the deprecated "concat_dim" alias; the value is unchanged.
  concat_param { axis: 2 }  # concat along h
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  include { stage: "unfactored" }
}
# Unfactored model, first LSTM (1000 outputs). Consumes the concatenated
# frame/sentence/stage input; cont_sentence is passed as the second bottom
# (presumably the per-timestep sequence-continuation markers expected by the
# recurrent layer -- confirm against the Caffe branch in use).
# Only instantiated when the net state contains stage "unfactored".
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_video_sequence"
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "unfactored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Unfactored model, second LSTM (1000 outputs) stacked directly on lstm1.
# Both stages are listed in a single include rule, so the layer is built only
# when the net state contains "unfactored" AND "2-layer".
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  include {
    stage: "unfactored"
    stage: "2-layer"
  }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Factored model, first LSTM (1000 outputs). Sees only the embedded frames;
# the text and stage indicator are joined in later by the factored "concat".
# Only instantiated when the net state contains stage "factored".
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Optional in-place dropout (ratio 0.5) on the output of lstm1.
# Only instantiated when the net state contains stage "dropLstm1".
layer {
  name: "drop_lstm1"
  type: "Dropout"
  bottom: "lstm1"
  top: "lstm1"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropLstm1" }
}
# Factored model: concatenate the lstm1 output, the embedded sentence and the
# stage indicator along axis 2 to form the input of lstm2.
# Only instantiated when the net state contains stage "factored".
layer {
  name: "concat"
  type: "Concat"
  # "axis" replaces the deprecated "concat_dim" alias; the value is unchanged.
  concat_param { axis: 2 }  # concat along h
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  include { stage: "factored" }
}
# Factored model, second LSTM (1000 outputs). Consumes the concatenation of
# the lstm1 output with the embedded sentence and stage indicator.
# Only instantiated when the net state contains stage "factored"; note that
# its output feeds "predict" only when stage "2-layer" is also set.
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Optional in-place dropout (ratio 0.5) on the output of lstm2.
# Only instantiated when the net state contains stage "dropLstm2".
layer {
  name: "drop_lstm2"
  type: "Dropout"
  bottom: "lstm2"
  top: "lstm2"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropLstm2" }
}
# Vocabulary scores for the single-layer path: reads lstm1 directly.
# Built whenever the net state does NOT contain stage "2-layer" (the sibling
# "predict" layer covers the 2-layer case, so exactly one of them exists).
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm1"
  top: "predict"
  # First param block: weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: bias (double learning rate, no weight decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  exclude { stage: "2-layer" }
  inner_product_param {
    # Must equal input_dim of the "embedding" layer (same vocabulary size).
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    # Apply the inner product along axis 2, keeping the two leading axes.
    axis: 2
  }
}
# Vocabulary scores for the 2-layer path: reads lstm2.
# Built only when the net state contains stage "2-layer" (the sibling
# "predict" layer covers the single-layer case).
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm2"
  top: "predict"
  # First param block: weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: bias (double learning rate, no weight decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  include { stage: "2-layer" }
  inner_product_param {
    # Must equal input_dim of the "embedding" layer (same vocabulary size).
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    # Apply the inner product along axis 2, keeping the two leading axes.
    axis: 2
  }
}
# Training objective: softmax cross-entropy between "predict" and
# "target_sentence", with the softmax taken over axis 2 (the vocabulary axis
# produced by "predict"). Targets equal to -1 are ignored, and the loss is
# scaled by 20.
layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  loss_param {
    ignore_label: -1
  }
  softmax_param {
    axis: 2
  }
}
# Accuracy of "predict" against "target_sentence", computed along axis 2 and
# skipping targets equal to -1 (same axis and ignore_label as the loss layer).
# Only instantiated in the TEST phase.
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  include { phase: TEST }
  accuracy_param {
    axis: 2
    ignore_label: -1
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Solver configuration for the S2VT network.
net: "./s2vt.prototxt"

# s2vt.prototxt supports multiple sequence to sequence architectures:
#   (1) stage: 'factored'   stage: '2-layer'
#   (2) stage: 'unfactored' stage: '1-layer'
#   (3) stage: 'unfactored' stage: '2-layer'
# Note: no rule in s2vt.prototxt matches '1-layer'; the single-layer path is
# selected simply by the absence of '2-layer', so '1-layer' acts as a label.
# Addons:
#   (a) stage: 'dropFc7'   [input frame feature dropout]
#   (b) stage: 'dropEn'    [text feature dropout after embedding]
#   (c) stage: 'dropLstm1' [dropout on output of lstm1]
#   (d) stage: 'dropLstm2' [dropout on output of lstm2]
#
# This solver uses variant (1) which performed best on the youtube dataset.
#
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate:
train_state: { stage: 'factored' stage: '2-layer' }

# Two test nets are built from the same net file, one per test_state; the
# test_iter entries are listed in the same order as the test_state entries.
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
test_interval: 1000

# Learning-rate schedule.
# NOTE(review): stepsize (20000) is larger than max_iter (18000), so with the
# "step" policy the rate is never multiplied by gamma during this run and
# stays at base_lr -- confirm this is intended.
base_lr: 0.01
lr_policy: "step"
gamma: 0.5
stepsize: 20000
display: 1
max_iter: 18000
momentum: 0.9
weight_decay: 0.0000

# Snapshots are written every 1000 iterations under this prefix.
snapshot: 1000
snapshot_prefix: "./snapshots/s2vt_youtube_vgg"
solver_mode: GPU
random_seed: 1701
average_loss: 100
clip_gradients: 10
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Sequence to Sequence -- Video to Text

Paper: [ICCV 2015 PDF](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)

Download Model: [S2VT_VGG_RGB_MODEL](https://www.dropbox.com/s/wn6k2oqurxzt6e2/s2s_vgg_pstream_allvocab_fac2_iter_16000.caffemodel?dl=1) (333MB)

[Project Page](https://vsubhashini.github.io/s2vt.html)
### Description

This is the S2VT (RGB) model described in the ICCV 2015 paper "Sequence to Sequence -- Video to Text". It uses video frame features from the
16-layer [VGG-16](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md)
model. It is trained only on the YouTube video dataset.

    Sequence to Sequence - Video to Text
    S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell, K. Saenko
    The IEEE International Conference on Computer Vision (ICCV) 2015

Please consider citing the above paper if you use this model.
### Performance

The METEOR score of this model is 29.2% on the YouTube (MSVD) video test dataset
(refer to Table 2 in the [Sequence to Sequence - Video to Text
paper](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)).
### Caffe compatibility

The models are currently supported by the `recurrent` branch of the Caffe forks
by [Jeff Donahue](https://github.com/jdonahue/caffe.git) and
[Subhashini Venugopalan](https://github.com/vsubhashini/caffe.git), but are not yet
compatible with the `master` branch of [Caffe](https://github.com/BVLC/caffe/).
### Training

More details on the code and data can be found on the [Project
Page](https://vsubhashini.github.io/s2vt.html).

The prototxts for the network and solver can also be found here:
https://github.com/vsubhashini/caffe/tree/recurrent/examples/s2vt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://ethereon.github.io/netscope/#/gist/689c598b83dc349dda317e2a03e960b3