## Translating Videos to Natural Language Using Deep Recurrent Neural Networks

Paper: NAACL-HLT 2015 PDF

Download Model: NAACL15_VGG_MEAN_POOL_MODEL (220MB)

Project Page

Description

The model is an improved version of the mean-pooled model described in the NAACL-HLT 2015 paper. It mean-pools fc7 features extracted from the video frames with the 16-layer VGG network, and it is trained only on the YouTube (MSVD) video description dataset.
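For intuition, mean pooling reduces a variable-length video to a single feature vector by averaging the per-frame fc7 activations. A minimal numpy sketch (not from the release; the feature extraction itself, a VGG-16 forward pass per sampled frame, is assumed to have happened already):

import numpy as np

def mean_pool(frame_feats):
    # frame_feats: (num_frames, 4096) array, one VGG-16 fc7 vector per
    # sampled frame. Returns the single 4096-d video descriptor that the
    # network below consumes as "mean_fc7".
    return frame_feats.mean(axis=0)

feats = np.random.randn(30, 4096).astype(np.float32)  # e.g. 30 frames
mean_fc7 = mean_pool(feats)
assert mean_fc7.shape == (4096,)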

Translating Videos to Natural Language Using Deep Recurrent Neural Networks
S. Venugopalan, H. Xu, J. Donahue, M. Rohrbach, R. Mooney, K. Saenko
North American Chapter of the Association for Computational Linguistics – Human Language Technologies
NAACL-HLT 2015

Please consider citing the above paper if you use this model.

Performance

The METEOR score of this model is 27.7% on the YouTube (MSVD) video test set (refer to Table 2 in the Sequence to Sequence – Video to Text paper).

Caffe compatibility

The models are currently supported by the recurrent branch of the Caffe fork maintained by Jeff Donahue and Subhashini Venugopalan, but they are not yet compatible with the master branch of Caffe.
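As a hedged sketch of what that means in practice (both filenames below are placeholders; a deploy-style prototxt for inference is not part of this gist), the released weights load like any other Caffe model once pycaffe is built from the recurrent branch:

import caffe  # pycaffe built from the 'recurrent' branch, not upstream master

caffe.set_mode_gpu()
# Placeholder paths: a deploy-style prototxt, and the downloaded
# NAACL15_VGG_MEAN_POOL_MODEL weights file saved locally.
net = caffe.Net('deploy.prototxt', 'naacl15_vgg_mean_pool.caffemodel', caffe.TEST)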

Training

More details on the code and data can be found on this Project Page.

The prototxts for the network (poolmean.prototxt, reproduced below) and its solver can also be found here: https://github.com/vsubhashini/caffe/tree/recurrent/examples/youtube

# The network is used for the video description experiments in [1].
# Please consider citing [1] if you use this example in your work.
#
# [1] S. Venugopalan, H. Xu, J. Donahue, M. Rohrbach, R. Mooney, K. Saenko.
#     "Translating Videos to Natural Language using Deep Recurrent Neural
#     Networks." NAACL-HLT 2015.
name: "mean_fc7_to_lstm"
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/train_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/train_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/train_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/train_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/valid_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/valid_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
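# Note on the sentence streams above: cont_sentence is the sequence
# continuation indicator consumed by the LSTM layers (0 at the first word
# of each caption, 1 within it); input_sentence carries word indices for
# the embedding; target_sentence carries the next-word labels, with -1
# marking padded positions that the loss and accuracy layers ignore.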
# sentence generation layers
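# Embed is a lookup table: each word index in input_sentence selects one
# row of a 12594 x 1000 weight matrix (the youtube vocabulary plus one).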
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param {
    lr_mult: 1
  }
  embed_param {
    bias_term: false
    input_dim: 12594  # youtube_vocab + 1
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
  }
}
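# Unfactored variants: the mean-pooled video feature enters the first
# LSTM together with the embedded words, so vision and language mix in
# the same layer.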
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_sentence"
  bottom: "cont_sentence"
  bottom: "mean_fc7"
  top: "lstm1"
  include { stage: "unfactored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  include {
    stage: "unfactored"
    stage: "2-layer"
  }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
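# Factored variant: the first LSTM models language only; the video
# feature is injected one layer up, into lstm2.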
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_sentence"
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  bottom: "mean_fc7"
  top: "lstm2"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
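# Two mutually exclusive prediction heads: 1-layer nets read lstm1,
# 2-layer nets read lstm2. Both project the 1000-d hidden state onto the
# 12594-word output vocabulary at every timestep (axis: 2).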
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm1"
  top: "predict"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  exclude { stage: "2-layer" }
  inner_product_param {
    num_output: 12594  # youtube_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    axis: 2
  }
}
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm2"
  top: "predict"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  include { stage: "2-layer" }
  inner_product_param {
    num_output: 12594  # youtube_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    axis: 2
  }
}
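# Per-timestep softmax over the vocabulary: label -1 (padding) is
# ignored, and axis: 2 selects the vocabulary dimension of the
# (time, stream, vocab) prediction blob.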
layer {
  name: "softmax_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "softmax_loss"
  loss_weight: 20
  loss_param {
    ignore_label: -1
  }
  softmax_param {
    axis: 2
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  include { phase: TEST }
  accuracy_param {
    axis: 2
    ignore_label: -1
  }
}
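The HDF5 chunks referenced by the data layers are plain HDF5 files whose dataset names match the layer tops, listed one path per line in hdf5_chunk_list.txt. A hedged sketch of writing one such pair of chunks (the 20-step, 100-stream, 4096-d shapes are assumptions read off the batch sizes and the fc7 features above; real data would use actual word indices rather than zeros):

import h5py
import numpy as np

T, N, FEAT_DIM = 20, 100, 4096  # assumed: timesteps, streams, fc7 size

# Sentence chunk: time-major arrays, one row per unrolled timestep.
cont = np.ones((T, N), dtype=np.float32)
cont[0, :] = 0                                 # 0 starts a new caption
inputs = np.zeros((T, N), dtype=np.float32)    # word indices (dummy here)
targets = -np.ones((T, N), dtype=np.float32)   # next words; -1 = padding

with h5py.File('sent_chunk_0.h5', 'w') as f:
    f.create_dataset('cont_sentence', data=cont)
    f.create_dataset('input_sentence', data=inputs)
    f.create_dataset('target_sentence', data=targets)

# Feature chunk: one mean-pooled fc7 vector per video stream.
with h5py.File('feat_chunk_0.h5', 'w') as f:
    f.create_dataset('mean_fc7', data=np.zeros((N, FEAT_DIM), np.float32))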
The solver prototxt:

net: "poolmean.prototxt"
# poolmean.prototxt supports three variants of the architecture:
# (1) stage: 'factored' stage: '2-layer'
# (2) stage: 'unfactored' stage: '1-layer'
# (3) stage: 'unfactored' stage: '2-layer'
# This solver uses variant (1).
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate:
train_state: { stage: 'factored' stage: '2-layer' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
test_interval: 1000
base_lr: 0.01
lr_policy: "step"
gamma: 0.5
stepsize: 20000
display: 1
max_iter: 60000
momentum: 0.9
weight_decay: 0.0000
snapshot: 1000
snapshot_prefix: "./snapshots/pool_fc7_mean_fac_2layer"
solver_mode: GPU
random_seed: 1701
average_loss: 100
clip_gradients: 10
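To actually run training with this solver, one option (assuming pycaffe from the recurrent branch, the HDF5 chunks in place, and a ./snapshots directory; 'solver.prototxt' is a placeholder filename for the solver above) is the standard pycaffe solver loop; the command-line caffe binary with -solver works equally well:

import caffe

caffe.set_mode_gpu()
solver = caffe.SGDSolver('solver.prototxt')  # placeholder filename
solver.solve()  # runs the max_iter schedule, snapshotting every 1000 iters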