Skip to content

Instantly share code, notes, and snippets.

Created December 2, 2015 00:38
Show Gist options
  • Save victorhcm/ad71ce5b9534a085b299 to your computer and use it in GitHub Desktop.
Save victorhcm/ad71ce5b9534a085b299 to your computer and use it in GitHub Desktop.
# The network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
# K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.
# The data is prepared using
# It is in (32) parallel streams.
# Name of the network defined by this prototxt.
name: "s2vt"
# TRAIN-phase input: reads HDF5 batches named in the train chunk list and
# exposes them as five top blobs.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  # Only instantiated when the net runs in the TRAIN phase.
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
    batch_size: 80
  }
}
# TEST-phase input for stage "test-on-train": reads the train_batches chunk
# list, i.e. evaluation is run on the training data.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  # Requires both the TEST phase and the "test-on-train" stage.
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
    batch_size: 80
  }
}
# TEST-phase input for stage "test-on-val": reads the valid_batches chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
  # Requires both the TEST phase and the "test-on-val" stage.
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
    batch_size: 80
  }
}
# Appends a trailing singleton axis to stage_indicator so it can be used as
# an input of the Concat layers.
layer {
  name: "reshape_stg_indicator" # from 80 32 to 80 32 1 to concat
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    # Dimensions are hard-coded; 80 presumably has to track batch_size of
    # the data layers and 32 the number of streams -- TODO confirm.
    shape {
      dim: 80
      dim: 32
      dim: 1
    }
  }
}
# Optional in-place dropout on the frame features (bottom and top are the
# same blob). Present only when stage "dropFc7" is enabled.
layer {
  name: "dropFc7"
  type: "Dropout"
  bottom: "frame_fc7"
  top: "frame_fc7"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropFc7" }
}
# Learned linear projection of the frame features to 500 outputs.
layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  # First param block: multipliers for the weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: multipliers for the bias (2x learning rate, no decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 500
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
      value: 0.2
    }
    # Negative axis indexes from the end: the product is taken over the
    # last axis of the bottom blob only.
    axis: -1
  }
}
# Lookup-table embedding of the input word indices into 500-d vectors,
# with no bias term.
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param {
    lr_mult: 1
  }
  embed_param {
    bias_term: false
    input_dim: 46168 #youtube_movie_vocab+1
    num_output: 500
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
  }
}
# Optional in-place dropout on the embedded sentence input. Present only
# when stage "dropEn" is enabled.
layer {
  name: "drop_input_en"
  type: "Dropout"
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropEn" }
}
# unfactored model concat frames and sents
# Stage "unfactored" only: joins embedded frames, embedded words and the
# stage indicator along axis 2 to form the input of lstm1.
layer {
  name: "concat"
  type: "Concat"
  concat_param { concat_dim: 2 } # concat along h
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  include { stage: "unfactored" }
}
# Stage "unfactored" only: first LSTM, run over the concatenated
# frame/word/stage input, with 1000 outputs.
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_video_sequence"
  # Second bottom of a Caffe recurrent layer is the sequence-continuation
  # indicator.
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "unfactored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Second LSTM stacked directly on lstm1, with 1000 outputs.
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  # A single include rule listing two stages: both "unfactored" and
  # "2-layer" must be enabled for this layer to be present.
  include {
    stage: "unfactored"
    stage: "2-layer"
  }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Stage "factored" only: first LSTM, run over the embedded frames alone,
# with 1000 outputs.
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Optional in-place dropout on the lstm1 output. Present only when stage
# "dropLstm1" is enabled.
layer {
  name: "drop_lstm1"
  type: "Dropout"
  bottom: "lstm1"
  top: "lstm1"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropLstm1" }
}
# Stage "factored" only: joins the lstm1 output, embedded words and the
# stage indicator along axis 2 to form the input of lstm2.
layer {
  name: "concat"
  type: "Concat"
  concat_param { concat_dim: 2 } # concat along h
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  include { stage: "factored" }
}
# Stage "factored" only: second LSTM, run over the concatenation of the
# lstm1 output with the word embedding and stage indicator.
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Optional in-place dropout on the lstm2 output. Present only when stage
# "dropLstm2" is enabled.
layer {
  name: "drop_lstm2"
  type: "Dropout"
  bottom: "lstm2"
  top: "lstm2"
  dropout_param { dropout_ratio: 0.5 }
  include { stage: "dropLstm2" }
}
# Word-score projection used when stage "2-layer" is NOT enabled: maps the
# lstm1 output to one score per vocabulary entry.
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm1"
  top: "predict"
  # First param block: multipliers for the weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: multipliers for the bias (2x learning rate, no decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  exclude { stage: "2-layer" }
  inner_product_param {
    num_output: 46168 # youtube_movie_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    # Inner product starts at axis 2, so axes 0 and 1 are preserved.
    axis: 2
  }
}
# Word-score projection used when stage "2-layer" is enabled: maps the
# lstm2 output to one score per vocabulary entry.
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm2"
  top: "predict"
  # First param block: multipliers for the weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Second param block: multipliers for the bias (2x learning rate, no decay).
  param {
    lr_mult: 2
    decay_mult: 0
  }
  include { stage: "2-layer" }
  inner_product_param {
    num_output: 46168 # youtube_movie_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    # Inner product starts at axis 2, so axes 0 and 1 are preserved.
    axis: 2
  }
}
# Softmax cross-entropy between the predicted word scores and the target
# words, weighted by 20 in the total objective.
layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  loss_param {
    # Targets equal to -1 contribute nothing to the loss.
    ignore_label: -1
  }
  softmax_param {
    # Softmax is taken over axis 2 of the predict blob.
    axis: 2
  }
}
# TEST-phase only: accuracy of the predicted word scores against the
# target words.
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  include { phase: TEST }
  accuracy_param {
    # Class scores lie along axis 2; targets equal to -1 are not counted.
    axis: 2
    ignore_label: -1
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment