walkoncross/Sequence to Sequence -- Video to Text

## s2vt.prototxt
# The network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
#     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

# The data is prepared using framefc7_stream_text_to_hdf5.py
# and is laid out in 32 parallel streams.
name: "s2vt"
layer {
  # TRAIN-phase input. Exposes five tops read via the HDF5Data layer; `source`
  # points at the train_batches chunk-list file, batch_size is 80.
  name: "data"
  type: "HDF5Data"
  include {
    phase: TRAIN
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
layer {
  # TEST-phase input selected by stage "test-on-train": exposes five tops and
  # reads from the train_batches chunk-list file, batch_size is 80.
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-train" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
layer {
  # TEST-phase input selected by stage "test-on-val": exposes five tops and
  # reads from the valid_batches chunk-list file, batch_size is 80.
  name: "data"
  type: "HDF5Data"
  include { phase: TEST stage: "test-on-val" }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
layer {
  # Appends a singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1) so it
  # can be passed to the Concat layers.
  # The dims are hard-coded: 80 matches batch_size in the data layers and must
  # be changed together with it.
  name: "reshape_stg_indicator"
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape { dim: 80 dim: 32 dim: 1 }
  }
}
layer {
  # Optional in-place dropout (ratio 0.5) on frame_fc7; the layer is only
  # included when the "dropFc7" stage is set.
  name: "dropFc7"
  type: "Dropout"
  include {
    stage: "dropFc7"
  }
  bottom: "frame_fc7"
  top: "frame_fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  # InnerProduct over the last axis (axis: -1) of frame_fc7, producing 500
  # outputs in embedded_input_frames.
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  # Two param blocks, kept in their original order (presumably weights first,
  # then bias -- confirm against the layer implementation).
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 500
    axis: -1
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0.2 }
  }
}
layer {
  # Embed layer mapping input_sentence to 500 outputs, without a bias term.
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param { lr_mult: 1 }
  embed_param {
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    bias_term: false
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
  }
}
layer {
  # Optional in-place dropout (ratio 0.5) on embedded_input_sentence; the layer
  # is only included when the "dropEn" stage is set.
  name: "drop_input_en"
  type: "Dropout"
  include {
    stage: "dropEn"
  }
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# unfactored model concat frames and sents
layer {
  # Unfactored variant: joins the embedded frames, the embedded words and the
  # 3-axis stage indicator along axis 2 ("concat along h") into
  # embedded_input_video_sequence.
  name: "concat"
  type: "Concat"
  # `axis` is used instead of the deprecated `concat_dim` alias; the axis
  # index (2) is unchanged and matches the `axis: 2` used by later layers.
  concat_param { axis: 2 }
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  include { stage: "unfactored" }
}
layer {
  # First LSTM of the unfactored variant: takes the concatenated
  # embedded_input_video_sequence plus cont_sentence and has 1000 outputs.
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "unfactored"
  }
  bottom: "embedded_input_video_sequence"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # Second LSTM of the unfactored variant, stacked directly on lstm1; its
  # include rule lists both the "unfactored" and "2-layer" stages.
  name: "lstm2"
  type: "LSTM"
  include { stage: "unfactored" stage: "2-layer" }
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # First LSTM of the factored variant: sees only embedded_input_frames (plus
  # cont_sentence) and has 1000 outputs.
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # Optional in-place dropout (ratio 0.5) on the lstm1 output; the layer is
  # only included when the "dropLstm1" stage is set.
  name: "drop_lstm1"
  type: "Dropout"
  include {
    stage: "dropLstm1"
  }
  bottom: "lstm1"
  top: "lstm1"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  # Factored variant: joins the lstm1 output, the embedded words and the
  # 3-axis stage indicator along axis 2 ("concat along h") into
  # lstm1_video_sequence.
  name: "concat"
  type: "Concat"
  # `axis` is used instead of the deprecated `concat_dim` alias; the axis
  # index (2) is unchanged and matches the `axis: 2` used by later layers.
  concat_param { axis: 2 }
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  include { stage: "factored" }
}
layer {
  # Second LSTM of the factored variant: takes lstm1_video_sequence plus
  # cont_sentence and has 1000 outputs.
  name: "lstm2"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # Optional in-place dropout (ratio 0.5) on the lstm2 output; the layer is
  # only included when the "dropLstm2" stage is set.
  name: "drop_lstm2"
  type: "Dropout"
  include {
    stage: "dropLstm2"
  }
  bottom: "lstm2"
  top: "lstm2"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  # Output projection for the single-LSTM case: InnerProduct from lstm1 to
  # 46168 outputs along axis 2. Excluded when the "2-layer" stage is set.
  name: "predict"
  type: "InnerProduct"
  exclude {
    stage: "2-layer"
  }
  bottom: "lstm1"
  top: "predict"
  # Two param blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 46168  # youtube_movie_vocab + 1
    axis: 2
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # Output projection for the two-LSTM case: InnerProduct from lstm2 to
  # 46168 outputs along axis 2. Included only when the "2-layer" stage is set.
  name: "predict"
  type: "InnerProduct"
  include {
    stage: "2-layer"
  }
  bottom: "lstm2"
  top: "predict"
  # Two param blocks, kept in their original order.
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  inner_product_param {
    num_output: 46168  # youtube_movie_vocab + 1
    axis: 2
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  # SoftmaxWithLoss between predict and target_sentence with the softmax taken
  # over axis 2. Label -1 is configured as the ignore label, and the loss
  # carries a weight of 20.
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  softmax_param { axis: 2 }
  loss_param { ignore_label: -1 }
}
layer {
  # TEST-phase accuracy of predict against target_sentence, evaluated over
  # axis 2 with label -1 configured as the ignore label.
  name: "accuracy"
  type: "Accuracy"
  include {
    phase: TEST
  }
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  accuracy_param { axis: 2 ignore_label: -1 }
}

## s2vt_solver.prototxt
# Network definition that this solver trains and tests.
net: "./s2vt.prototxt"

# s2vt.prototxt supports multiple sequence to sequence architectures:
# (1) stage: 'factored' stage: '2-layer'
# (2) stage: 'unfactored' stage: '1-layer'
# (3) stage: 'unfactored' stage: '2-layer'
# Addons:
# (a) stage: 'dropFc7'   [input frame feature dropout]
# (b) stage: 'dropEn'    [text feature dropout after embedding]
# (c) stage: 'dropLstm1' [dropout on output of lstm1]
# (d) stage: 'dropLstm2' [dropout on output of lstm2]
#
# This solver uses variant (1) which performed best on the youtube dataset.
#
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate:

train_state: { stage: 'factored' stage: '2-layer' }
# test_iter / test_state are each given twice: once for the 'test-on-train'
# stage and once for 'test-on-val'. They are presumably matched by position,
# so keep each test_iter next to its test_state and keep the order stable.
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
test_interval: 1000
base_lr: 0.01
# NOTE(review): stepsize (20000) is larger than max_iter (18000), so the
# "step" decay by gamma presumably never fires during this run -- confirm
# that a constant learning rate is intended.
lr_policy: "step"
gamma: 0.5
stepsize: 20000
display: 1
max_iter: 18000
momentum: 0.9
weight_decay: 0.0000
# A snapshot is written every 1000 iterations under ./snapshots/.
snapshot: 1000
snapshot_prefix: "./snapshots/s2vt_youtube_vgg"
solver_mode: GPU
random_seed: 1701
average_loss: 100
clip_gradients: 10

## Sequence to Sequence -- Video to Text
## Sequence to Sequence -- Video to Text

Paper : [ICCV 2015 PDF](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)

Download Model: [S2VT_VGG_RGB_MODEL](https://www.dropbox.com/s/wn6k2oqurxzt6e2/s2s_vgg_pstream_allvocab_fac2_iter_16000.caffemodel?dl=1) (333MB)

[Project Page](https://vsubhashini.github.io/s2vt.html)

### Description

This is the S2VT (RGB) model described in the ICCV 2015 paper "Sequence to Sequence -- Video to Text". It uses video frame features from the
[VGG-16](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md)
(16-layer) model. It was trained only on the YouTube video dataset.

    Sequence to Sequence - Video to Text
    S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell, K. Saenko
    The IEEE International Conference on Computer Vision (ICCV) 2015

Please consider citing the above paper if you use this model.

### Performance

The METEOR score of this model is 29.2% on the Youtube (MSVD) video test dataset.
(refer to Table 2 in the [Sequence to Sequence - Video to Text
paper](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)).

### Caffe compatibility

The models are currently supported by the `recurrent` branch of the Caffe fork
by [Jeff Donahue](https://github.com/jdonahue/caffe.git) and
[Subhashini Venugopalan](https://github.com/vsubhashini/caffe.git), but are not yet
compatible with the `master` branch of [Caffe](https://github.com/BVLC/caffe/).

### Training

More details on the code and data can be found on this [Project
Page](https://vsubhashini.github.io/s2vt.html).

The prototxts for the network and solver can also be found here:
https://github.com/vsubhashini/caffe/tree/recurrent/examples/s2vt
	# The network is used for the video description experiments of S2VT [1].
	# Please consider citing S2VT [1] if you use this example in your work.
	#
	# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
	# K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

	# The data is prepared using framefc7_stream_text_to_hdf5.py
	# It is in (32) parallel streams.
	name: "s2vt"
	layer {
	  # TRAIN-phase input: five tops read via HDF5Data, batch_size 80.
	  name: "data"
	  type: "HDF5Data"
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	  include { phase: TRAIN }
	  hdf5_data_param {
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	    batch_size: 80
	  }
	}
	layer {
	  # TEST-phase input for stage "test-on-train" (train_batches chunk list).
	  name: "data"
	  type: "HDF5Data"
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	  include {
	    phase: TEST
	    stage: "test-on-train"
	  }
	  hdf5_data_param {
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	    batch_size: 80
	  }
	}
	layer {
	  # TEST-phase input for stage "test-on-val" (valid_batches chunk list).
	  name: "data"
	  type: "HDF5Data"
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	  include {
	    phase: TEST
	    stage: "test-on-val"
	  }
	  hdf5_data_param {
	    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
	    batch_size: 80
	  }
	}
	layer {
	  # Reshapes stage_indicator from 80 x 32 to 80 x 32 x 1 so it can be
	  # concatenated; the dims are hard-coded (80 matches batch_size).
	  name: "reshape_stg_indicator"
	  type: "Reshape"
	  bottom: "stage_indicator"
	  top: "stage_indicator_3axis"
	  reshape_param {
	    shape {
	      dim: 80
	      dim: 32
	      dim: 1
	    }
	  }
	}
	layer {
	  # In-place dropout (0.5) on frame_fc7, enabled by stage "dropFc7".
	  name: "dropFc7"
	  type: "Dropout"
	  bottom: "frame_fc7"
	  top: "frame_fc7"
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  include {
	    stage: "dropFc7"
	  }
	}
	layer {
	  # InnerProduct over the last axis of frame_fc7, 500 outputs.
	  name: "embed_encoder"
	  type: "InnerProduct"
	  bottom: "frame_fc7"
	  top: "embedded_input_frames"
	  param {
	    lr_mult: 1
	    decay_mult: 1
	  }
	  param {
	    lr_mult: 2
	    decay_mult: 0
	  }
	  inner_product_param {
	    num_output: 500
	    weight_filler {
	      type: "xavier"
	    }
	    bias_filler {
	      type: "constant"
	      value: 0.2
	    }
	    axis: -1
	  }
	}
	layer {
	  # Embed layer mapping input_sentence to 500 outputs, no bias term.
	  name: "embedding"
	  type: "Embed"
	  bottom: "input_sentence"
	  top: "embedded_input_sentence"
	  param {
	    lr_mult: 1
	  }
	  embed_param {
	    bias_term: false
	    input_dim: 46168  # youtube_movie_vocab + 1
	    num_output: 500
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	  }
	}
	layer {
	  # In-place dropout (0.5) on embedded_input_sentence, enabled by stage
	  # "dropEn".
	  name: "drop_input_en"
	  type: "Dropout"
	  bottom: "embedded_input_sentence"
	  top: "embedded_input_sentence"
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  include {
	    stage: "dropEn"
	  }
	}
	# unfactored model concat frames and sents
	layer {
	  # Unfactored variant: joins embedded frames, embedded words and the
	  # 3-axis stage indicator along axis 2 ("concat along h").
	  name: "concat"
	  type: "Concat"
	  # `axis` is used instead of the deprecated `concat_dim` alias; the
	  # axis index (2) is unchanged.
	  concat_param { axis: 2 }
	  bottom: "embedded_input_frames"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "embedded_input_video_sequence"
	  include { stage: "unfactored" }
	}
	layer {
	  # First LSTM, unfactored variant: concatenated input, 1000 outputs.
	  name: "lstm1"
	  type: "LSTM"
	  bottom: "embedded_input_video_sequence"
	  bottom: "cont_sentence"
	  top: "lstm1"
	  include { stage: "unfactored" }
	  recurrent_param {
	    num_output: 1000
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	  }
	}
	layer {
	  # Second LSTM, unfactored variant: stacked on lstm1; the include rule
	  # lists both "unfactored" and "2-layer".
	  name: "lstm2"
	  type: "LSTM"
	  bottom: "lstm1"
	  bottom: "cont_sentence"
	  top: "lstm2"
	  include {
	    stage: "unfactored"
	    stage: "2-layer"
	  }
	  recurrent_param {
	    num_output: 1000
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	  }
	}
	layer {
	  # First LSTM, factored variant: embedded frames only, 1000 outputs.
	  name: "lstm1"
	  type: "LSTM"
	  bottom: "embedded_input_frames"
	  bottom: "cont_sentence"
	  top: "lstm1"
	  include { stage: "factored" }
	  recurrent_param {
	    num_output: 1000
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	  }
	}
	layer {
	  # In-place dropout (0.5) on lstm1, enabled by stage "dropLstm1".
	  name: "drop_lstm1"
	  type: "Dropout"
	  bottom: "lstm1"
	  top: "lstm1"
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  include {
	    stage: "dropLstm1"
	  }
	}
	layer {
	  # Factored variant: joins the lstm1 output, embedded words and the
	  # 3-axis stage indicator along axis 2 ("concat along h").
	  name: "concat"
	  type: "Concat"
	  # `axis` is used instead of the deprecated `concat_dim` alias; the
	  # axis index (2) is unchanged.
	  concat_param { axis: 2 }
	  bottom: "lstm1"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "lstm1_video_sequence"
	  include { stage: "factored" }
	}
	layer {
	  # Second LSTM, factored variant: lstm1_video_sequence in, 1000 outputs.
	  name: "lstm2"
	  type: "LSTM"
	  bottom: "lstm1_video_sequence"
	  bottom: "cont_sentence"
	  top: "lstm2"
	  include { stage: "factored" }
	  recurrent_param {
	    num_output: 1000
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	  }
	}
	layer {
	  # In-place dropout (0.5) on lstm2, enabled by stage "dropLstm2".
	  name: "drop_lstm2"
	  type: "Dropout"
	  bottom: "lstm2"
	  top: "lstm2"
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  include {
	    stage: "dropLstm2"
	  }
	}
	layer {
	  # Output projection from lstm1 (46168 outputs, axis 2); excluded when
	  # the "2-layer" stage is set.
	  name: "predict"
	  type: "InnerProduct"
	  bottom: "lstm1"
	  top: "predict"
	  param {
	    lr_mult: 1
	    decay_mult: 1
	  }
	  param {
	    lr_mult: 2
	    decay_mult: 0
	  }
	  exclude { stage: "2-layer" }
	  inner_product_param {
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	    axis: 2
	  }
	}
	layer {
	  # Output projection from lstm2 (46168 outputs, axis 2); included only
	  # when the "2-layer" stage is set.
	  name: "predict"
	  type: "InnerProduct"
	  bottom: "lstm2"
	  top: "predict"
	  param {
	    lr_mult: 1
	    decay_mult: 1
	  }
	  param {
	    lr_mult: 2
	    decay_mult: 0
	  }
	  include { stage: "2-layer" }
	  inner_product_param {
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler {
	      type: "uniform"
	      min: -0.08
	      max: 0.08
	    }
	    bias_filler {
	      type: "constant"
	      value: 0
	    }
	    axis: 2
	  }
	}
	layer {
	  # SoftmaxWithLoss over axis 2 between predict and target_sentence;
	  # ignore label -1, loss weight 20.
	  name: "cross_entropy_loss"
	  type: "SoftmaxWithLoss"
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "cross_entropy_loss"
	  loss_weight: 20
	  loss_param {
	    ignore_label: -1
	  }
	  softmax_param {
	    axis: 2
	  }
	}
	layer {
	  # TEST-phase accuracy of predict vs. target_sentence over axis 2,
	  # ignore label -1.
	  name: "accuracy"
	  type: "Accuracy"
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "accuracy"
	  include { phase: TEST }
	  accuracy_param {
	    axis: 2
	    ignore_label: -1
	  }
	}
	# Network definition that this solver trains and tests.
	net: "./s2vt.prototxt"

	# s2vt.prototxt supports multiple sequence to sequence architectures:
	# (1) stage: 'factored' stage: '2-layer'
	# (2) stage: 'unfactored' stage: '1-layer'
	# (3) stage: 'unfactored' stage: '2-layer'
	# Addons:
	# (a) stage: 'dropFc7' [input frame feature dropout]
	# (b) stage: 'dropEn' [text feature dropout after embedding]
	# (c) stage: 'dropLstm1' [dropout on output of lstm1]
	# (d) stage: 'dropLstm2' [dropout on output of lstm2]
	#
	# This solver uses variant (1) which performed best on the youtube dataset.
	#
	# To use a different variant, modify the states (train_state, test_state)
	# below as appropriate:

	train_state: { stage: 'factored' stage: '2-layer' }
	# test_iter / test_state are each given twice: once for 'test-on-train'
	# and once for 'test-on-val'. They are presumably matched by position,
	# so keep each test_iter next to its test_state.
	test_iter: 25
	test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
	test_iter: 25
	test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
	test_interval: 1000
	base_lr: 0.01
	# NOTE(review): stepsize (20000) is larger than max_iter (18000), so the
	# "step" decay by gamma presumably never fires -- confirm intent.
	lr_policy: "step"
	gamma: 0.5
	stepsize: 20000
	display: 1
	max_iter: 18000
	momentum: 0.9
	weight_decay: 0.0000
	snapshot: 1000
	snapshot_prefix: "./snapshots/s2vt_youtube_vgg"
	solver_mode: GPU
	random_seed: 1701
	average_loss: 100
	clip_gradients: 10
	## Sequence to Sequence -- Video to Text

	Paper : [ICCV 2015 PDF](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)

	Download Model: [S2VT_VGG_RGB_MODEL](https://www.dropbox.com/s/wn6k2oqurxzt6e2/s2s_vgg_pstream_allvocab_fac2_iter_16000.caffemodel?dl=1) (333MB)

	[Project Page](https://vsubhashini.github.io/s2vt.html)

	### Description

	This is the S2VT (RGB) model described in the ICCV 2015 paper "Sequence to Sequence -- Video to Text". It uses video frame features from the
	[VGG-16](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md)
	(16-layer) model. It was trained only on the YouTube video dataset.

	Sequence to Sequence - Video to Text
	S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell, K. Saenko
	The IEEE International Conference on Computer Vision (ICCV) 2015

	Please consider citing the above paper if you use this model.

	### Performance

	The METEOR score of this model is 29.2% on the Youtube (MSVD) video test dataset.
	(refer to Table 2 in the [Sequence to Sequence - Video to Text
	paper](http://www.cs.utexas.edu/users/ml/papers/venugopalan.iccv15.pdf)).

	### Caffe compatibility

	The models are currently supported by the `recurrent` branch of the Caffe fork
	by [Jeff Donahue](https://github.com/jdonahue/caffe.git) and
	[Subhashini Venugopalan](https://github.com/vsubhashini/caffe.git), but are not yet
	compatible with the `master` branch of [Caffe](https://github.com/BVLC/caffe/).

	### Training

	More details on the code and data can be found on this [Project
	Page](https://vsubhashini.github.io/s2vt.html).

	The prototxts for the network and solver can also be found here:
	https://github.com/vsubhashini/caffe/tree/recurrent/examples/s2vt