## Translating Videos to Natural Language Using Deep Recurrent Neural Networks

Paper: NAACL-HLT 2015 PDF

Download Model: NAACL15_VGG_MEAN_POOL_MODEL (220MB)

Project Page

Description

The model is an improved version of the mean-pooled model described in the NAACL-HLT 2015 paper. It mean-pools fc7 features extracted from the video frames with the 16-layer VGG network, and it is trained only on the YouTube (MSVD) video description dataset.
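For intuition, mean pooling reduces a variable-length video to a single feature vector by averaging the per-frame fc7 activations. A minimal numpy sketch (not from the release; the feature extraction itself, a VGG-16 forward pass per sampled frame, is assumed to have happened already):

import numpy as np

def mean_pool(frame_feats):
    # frame_feats: (num_frames, 4096) array, one VGG-16 fc7 vector per
    # sampled frame. Returns the single 4096-d video descriptor that the
    # network below consumes as "mean_fc7".
    return frame_feats.mean(axis=0)

feats = np.random.randn(30, 4096).astype(np.float32)  # e.g. 30 frames
mean_fc7 = mean_pool(feats)
assert mean_fc7.shape == (4096,)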

Translating Videos to Natural Language Using Deep Recurrent Neural Networks
S. Venugopalan, H. Xu, J. Donahue, M. Rohrbach, R. Mooney, K. Saenko
North American Chapter of the Association for Computational Linguistics – Human Language Technologies
NAACL-HLT 2015

Please consider citing the above paper if you use this model.

Performance

The METEOR score of this model is 27.7% on the YouTube (MSVD) video test set (refer to Table 2 in the Sequence to Sequence – Video to Text paper).

Caffe compatibility

The models are currently supported by the recurrent branch of the Caffe fork maintained by Jeff Donahue and Subhashini Venugopalan, but they are not yet compatible with the master branch of Caffe.
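As a hedged sketch of what that means in practice (both filenames below are placeholders; a deploy-style prototxt for inference is not part of this gist), the released weights load like any other Caffe model once pycaffe is built from the recurrent branch:

import caffe  # pycaffe built from the 'recurrent' branch, not upstream master

caffe.set_mode_gpu()
# Placeholder paths: a deploy-style prototxt, and the downloaded
# NAACL15_VGG_MEAN_POOL_MODEL weights file saved locally.
net = caffe.Net('deploy.prototxt', 'naacl15_vgg_mean_pool.caffemodel', caffe.TEST)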

Training

More details on the code and data can be found on this Project Page.

The prototxts for the network (poolmean.prototxt, reproduced below) and its solver can also be found here: https://github.com/vsubhashini/caffe/tree/recurrent/examples/youtube

# The network is used for the video description experiments in [1].
# Please consider citing [1] if you use this example in your work.
#
# [1] S. Venugopalan, H. Xu, J. Donahue, M. Rohrbach, R. Mooney, K. Saenko.
#     "Translating Videos to Natural Language using Deep Recurrent Neural
#     Networks." NAACL-HLT 2015.
name: "mean_fc7_to_lstm"
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/train_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include { phase: TRAIN }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/train_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/train_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/train_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "mean_fc7"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_1_ytprepoolbasis_1/valid_batches/hdf5_chunk_list.txt"
    batch_size: 100
  }
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    source: "./hdf5/buffer_100_ytprepool_20/valid_batches/hdf5_chunk_list.txt"
    batch_size: 20
  }
}
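# Note on the sentence streams above: cont_sentence is the sequence
# continuation indicator consumed by the LSTM layers (0 at the first word
# of each caption, 1 within it); input_sentence carries word indices for
# the embedding; target_sentence carries the next-word labels, with -1
# marking padded positions that the loss and accuracy layers ignore.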
# sentence generation layers
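# Embed is a lookup table: each word index in input_sentence selects one
# row of a 12594 x 1000 weight matrix (the youtube vocabulary plus one).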
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  param {
    lr_mult: 1
  }
  embed_param {
    bias_term: false
    input_dim: 12594  # youtube_vocab + 1
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
  }
}
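# Unfactored variants: the mean-pooled video feature enters the first
# LSTM together with the embedded words, so vision and language mix in
# the same layer.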
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_sentence"
  bottom: "cont_sentence"
  bottom: "mean_fc7"
  top: "lstm1"
  include { stage: "unfactored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  include {
    stage: "unfactored"
    stage: "2-layer"
  }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
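# Factored variant: the first LSTM models language only; the video
# feature is injected one layer up, into lstm2.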
layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "embedded_input_sentence"
  bottom: "cont_sentence"
  top: "lstm1"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "lstm2"
  type: "LSTM"
  bottom: "lstm1"
  bottom: "cont_sentence"
  bottom: "mean_fc7"
  top: "lstm2"
  include { stage: "factored" }
  recurrent_param {
    num_output: 1000
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
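# Two mutually exclusive prediction heads: 1-layer nets read lstm1,
# 2-layer nets read lstm2. Both project the 1000-d hidden state onto the
# 12594-word output vocabulary at every timestep (axis: 2).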
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm1"
  top: "predict"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  exclude { stage: "2-layer" }
  inner_product_param {
    num_output: 12594  # youtube_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    axis: 2
  }
}
layer {
  name: "predict"
  type: "InnerProduct"
  bottom: "lstm2"
  top: "predict"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  include { stage: "2-layer" }
  inner_product_param {
    num_output: 12594  # youtube_vocab + 1
    weight_filler {
      type: "uniform"
      min: -0.08
      max: 0.08
    }
    bias_filler {
      type: "constant"
      value: 0
    }
    axis: 2
  }
}
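# Per-timestep softmax over the vocabulary: label -1 (padding) is
# ignored, and axis: 2 selects the vocabulary dimension of the
# (time, stream, vocab) prediction blob.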
layer {
  name: "softmax_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "softmax_loss"
  loss_weight: 20
  loss_param {
    ignore_label: -1
  }
  softmax_param {
    axis: 2
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  include { phase: TEST }
  accuracy_param {
    axis: 2
    ignore_label: -1
  }
}
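The HDF5 chunks referenced by the data layers are plain HDF5 files whose dataset names match the layer tops, listed one path per line in hdf5_chunk_list.txt. A hedged sketch of writing one such pair of chunks (the 20-step, 100-stream, 4096-d shapes are assumptions read off the batch sizes and the fc7 features above; real data would use actual word indices rather than zeros):

import h5py
import numpy as np

T, N, FEAT_DIM = 20, 100, 4096  # assumed: timesteps, streams, fc7 size

# Sentence chunk: time-major arrays, one row per unrolled timestep.
cont = np.ones((T, N), dtype=np.float32)
cont[0, :] = 0                                 # 0 starts a new caption
inputs = np.zeros((T, N), dtype=np.float32)    # word indices (dummy here)
targets = -np.ones((T, N), dtype=np.float32)   # next words; -1 = padding

with h5py.File('sent_chunk_0.h5', 'w') as f:
    f.create_dataset('cont_sentence', data=cont)
    f.create_dataset('input_sentence', data=inputs)
    f.create_dataset('target_sentence', data=targets)

# Feature chunk: one mean-pooled fc7 vector per video stream.
with h5py.File('feat_chunk_0.h5', 'w') as f:
    f.create_dataset('mean_fc7', data=np.zeros((N, FEAT_DIM), np.float32))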
The solver prototxt:

net: "poolmean.prototxt"
# poolmean.prototxt supports three variants of the architecture:
# (1) stage: 'factored' stage: '2-layer'
# (2) stage: 'unfactored' stage: '1-layer'
# (3) stage: 'unfactored' stage: '2-layer'
# This solver uses variant (1).
# To use a different variant, modify the states (train_state, test_state)
# below as appropriate:
train_state: { stage: 'factored' stage: '2-layer' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
test_iter: 25
test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
test_interval: 1000
base_lr: 0.01
lr_policy: "step"
gamma: 0.5
stepsize: 20000
display: 1
max_iter: 60000
momentum: 0.9
weight_decay: 0.0000
snapshot: 1000
snapshot_prefix: "./snapshots/pool_fc7_mean_fac_2layer"
solver_mode: GPU
random_seed: 1701
average_loss: 100
clip_gradients: 10
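To actually run training with this solver, one option (assuming pycaffe from the recurrent branch, the HDF5 chunks in place, and a ./snapshots directory; 'solver.prototxt' is a placeholder filename for the solver above) is the standard pycaffe solver loop; the command-line caffe binary with -solver works equally well:

import caffe

caffe.set_mode_gpu()
solver = caffe.SGDSolver('solver.prototxt')  # placeholder filename
solver.solve()  # runs the max_iter schedule, snapshotting every 1000 iters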