# victorhcm/s2vt_lstm.prototxt

## s2vt_lstm.prototxt
# The network is used for the video description experiments of S2VT [1].
# Please consider citing S2VT [1] if you use this example in your work.
#
# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
#     K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

# The data is prepared using framefc7_stream_text_to_hdf5.py
# and is arranged in 32 parallel streams.
name: "s2vt"
# Training-phase input: an HDF5Data layer exposing five top blobs.
layer {
  name: "data"
  type: "HDF5Data"
  include {
    phase: TRAIN
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
# TEST-phase input used when the "test-on-train" stage is set; it reads the
# training chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  include {
    phase: TEST
    stage: "test-on-train"
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
# TEST-phase input used when the "test-on-val" stage is set; it reads the
# validation chunk list.
layer {
  name: "data"
  type: "HDF5Data"
  include {
    phase: TEST
    stage: "test-on-val"
  }
  hdf5_data_param {
    batch_size: 80
    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
  }
  top: "cont_sentence"
  top: "input_sentence"
  top: "target_sentence"
  top: "stage_indicator"
  top: "frame_fc7"
}
# Appends a trailing singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1
# with the data layers in this file) so it can be concatenated along axis 2.
layer {
  name: "reshape_stg_indicator"
  type: "Reshape"
  bottom: "stage_indicator"
  top: "stage_indicator_3axis"
  reshape_param {
    shape {
      # dim: 0 copies the corresponding bottom dimension, so this layer no
      # longer has to be edited when batch_size or the stream count changes.
      dim: 0  # was 80
      dim: 0  # was 32
      dim: 1  # new trailing axis
    }
  }
}
# Optional in-place dropout on the fc7 frame features; the layer is only
# included when the "dropFc7" stage is set.
layer {
  name: "dropFc7"
  type: "Dropout"
  include {
    stage: "dropFc7"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "frame_fc7"
  top: "frame_fc7"
}
# Learned linear projection of the frame features to 500 outputs.
layer {
  name: "embed_encoder"
  type: "InnerProduct"
  bottom: "frame_fc7"
  top: "embedded_input_frames"
  inner_product_param {
    axis: -1  # negative index: counts from the last axis
    num_output: 500
    weight_filler { type: "xavier" }
    bias_filler { type: "constant" value: 0.2 }
  }
  # First param block (weights): base learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
}
# Embedding lookup for the input sentence: 46168 possible inputs, 500 outputs,
# no bias term.
layer {
  name: "embedding"
  type: "Embed"
  bottom: "input_sentence"
  top: "embedded_input_sentence"
  embed_param {
    input_dim: 46168  # youtube_movie_vocab + 1
    num_output: 500
    bias_term: false
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
  }
  param { lr_mult: 1 }
}
# Optional in-place dropout on the embedded sentence; the layer is only
# included when the "dropEn" stage is set.
layer {
  name: "drop_input_en"
  type: "Dropout"
  include {
    stage: "dropEn"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "embedded_input_sentence"
  top: "embedded_input_sentence"
}
# Unfactored model: concatenate the embedded frames, embedded sentence and stage indicator.
# Joins the embedded frames, the embedded sentence and the 3-axis stage
# indicator along axis 2. Included only for the "unfactored" stage.
layer {
  name: "concat"
  type: "Concat"
  bottom: "embedded_input_frames"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "embedded_input_video_sequence"
  # `axis` replaces the deprecated `concat_dim` alias; the axis is still 2.
  concat_param { axis: 2 }
  include { stage: "unfactored" }
}
# First LSTM of the "unfactored" variant: consumes the concatenated
# frame/sentence/stage input. cont_sentence is passed as the second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "unfactored"
  }
  bottom: "embedded_input_video_sequence"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Second LSTM of the "unfactored" variant, stacked directly on lstm1.
# The single include rule lists two stages: "unfactored" and "2-layer".
layer {
  name: "lstm2"
  type: "LSTM"
  include {
    stage: "unfactored"
    stage: "2-layer"
  }
  bottom: "lstm1"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# First LSTM of the "factored" variant: sees only the embedded frames.
# cont_sentence is passed as the second bottom.
layer {
  name: "lstm1"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "embedded_input_frames"
  bottom: "cont_sentence"
  top: "lstm1"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Optional in-place dropout on the lstm1 output; the layer is only included
# when the "dropLstm1" stage is set.
layer {
  name: "drop_lstm1"
  type: "Dropout"
  include {
    stage: "dropLstm1"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "lstm1"
  top: "lstm1"
}
# Joins the lstm1 output, the embedded sentence and the 3-axis stage indicator
# along axis 2. Included only for the "factored" stage.
layer {
  name: "concat"
  type: "Concat"
  bottom: "lstm1"
  bottom: "embedded_input_sentence"
  bottom: "stage_indicator_3axis"
  top: "lstm1_video_sequence"
  # `axis` replaces the deprecated `concat_dim` alias; the axis is still 2.
  concat_param { axis: 2 }
  include { stage: "factored" }
}
# Second LSTM of the "factored" variant: consumes lstm1 output concatenated
# with the embedded sentence and stage indicator.
layer {
  name: "lstm2"
  type: "LSTM"
  include {
    stage: "factored"
  }
  bottom: "lstm1_video_sequence"
  bottom: "cont_sentence"
  top: "lstm2"
  recurrent_param {
    num_output: 1000
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Optional in-place dropout on the lstm2 output; the layer is only included
# when the "dropLstm2" stage is set.
layer {
  name: "drop_lstm2"
  type: "Dropout"
  include {
    stage: "dropLstm2"
  }
  dropout_param {
    dropout_ratio: 0.5
  }
  bottom: "lstm2"
  top: "lstm2"
}
# Output projection for the single-layer case: maps lstm1 to 46168 scores.
# Excluded whenever the "2-layer" stage is set.
layer {
  name: "predict"
  type: "InnerProduct"
  exclude {
    stage: "2-layer"
  }
  bottom: "lstm1"
  top: "predict"
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
  # First param block (weights): base learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
}
# Output projection for the two-layer case: maps lstm2 to 46168 scores.
# Included only when the "2-layer" stage is set.
layer {
  name: "predict"
  type: "InnerProduct"
  include {
    stage: "2-layer"
  }
  bottom: "lstm2"
  top: "predict"
  inner_product_param {
    axis: 2
    num_output: 46168  # youtube_movie_vocab + 1
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
  # First param block (weights): base learning rate, weight decay enabled.
  param { lr_mult: 1 decay_mult: 1 }
  # Second param block (bias): doubled learning rate, no weight decay.
  param { lr_mult: 2 decay_mult: 0 }
}
# Softmax cross-entropy between predict and target_sentence, with the softmax
# taken over axis 2. Targets equal to -1 are ignored; the loss is weighted by 20.
layer {
  name: "cross_entropy_loss"
  type: "SoftmaxWithLoss"
  bottom: "predict"
  bottom: "target_sentence"
  top: "cross_entropy_loss"
  loss_weight: 20
  softmax_param { axis: 2 }
  loss_param { ignore_label: -1 }
}
# TEST-only accuracy of predict against target_sentence, evaluated over axis 2
# and skipping targets equal to -1.
layer {
  name: "accuracy"
  type: "Accuracy"
  include {
    phase: TEST
  }
  bottom: "predict"
  bottom: "target_sentence"
  top: "accuracy"
  accuracy_param { ignore_label: -1 axis: 2 }
}
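	# The network is used for the video description experiments of S2VT [1].
	# Please consider citing S2VT [1] if you use this example in your work.
	#
	# [1] S. Venugopalan, M. Rohrbach, J. Donahue, R. Mooney, T. Darrell,
	# K. Saenko. "Sequence to Sequence - Video to Text." ICCV 2015.

	# The data is prepared using framefc7_stream_text_to_hdf5.py
	# and is arranged in 32 parallel streams.
	#
	# NOTE(review): from this point on the file appears to repeat the network
	# definition above (with the indentation flattened). Protobuf text format
	# normally rejects a second `name:` for a singular field, so confirm
	# whether this copy should be removed.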
	name: "s2vt"
	# Training-phase input: an HDF5Data layer exposing five top blobs.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include {
	    phase: TRAIN
	  }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	  }
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# TEST-phase input used when the "test-on-train" stage is set; it reads the
	# training chunk list.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include {
	    phase: TEST
	    stage: "test-on-train"
	  }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/train_batches/hdf5_chunk_list.txt"
	  }
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# TEST-phase input used when the "test-on-val" stage is set; it reads the
	# validation chunk list.
	layer {
	  name: "data"
	  type: "HDF5Data"
	  include {
	    phase: TEST
	    stage: "test-on-val"
	  }
	  hdf5_data_param {
	    batch_size: 80
	    source: "./hdf5/buffer_32_s2vt_80/valid_batches/hdf5_chunk_list.txt"
	  }
	  top: "cont_sentence"
	  top: "input_sentence"
	  top: "target_sentence"
	  top: "stage_indicator"
	  top: "frame_fc7"
	}
	# Appends a trailing singleton axis to stage_indicator (80 x 32 -> 80 x 32 x 1
	# with the data layers in this file) so it can be concatenated along axis 2.
	layer {
	  name: "reshape_stg_indicator"
	  type: "Reshape"
	  bottom: "stage_indicator"
	  top: "stage_indicator_3axis"
	  reshape_param {
	    shape {
	      # dim: 0 copies the corresponding bottom dimension, so this layer no
	      # longer has to be edited when batch_size or the stream count changes.
	      dim: 0  # was 80
	      dim: 0  # was 32
	      dim: 1  # new trailing axis
	    }
	  }
	}
	# Optional in-place dropout on the fc7 frame features; the layer is only
	# included when the "dropFc7" stage is set.
	layer {
	  name: "dropFc7"
	  type: "Dropout"
	  include {
	    stage: "dropFc7"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "frame_fc7"
	  top: "frame_fc7"
	}
	# Learned linear projection of the frame features to 500 outputs.
	layer {
	  name: "embed_encoder"
	  type: "InnerProduct"
	  bottom: "frame_fc7"
	  top: "embedded_input_frames"
	  inner_product_param {
	    axis: -1  # negative index: counts from the last axis
	    num_output: 500
	    weight_filler { type: "xavier" }
	    bias_filler { type: "constant" value: 0.2 }
	  }
	  # First param block (weights): base learning rate, weight decay enabled.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Embedding lookup for the input sentence: 46168 possible inputs, 500 outputs,
	# no bias term.
	layer {
	  name: "embedding"
	  type: "Embed"
	  bottom: "input_sentence"
	  top: "embedded_input_sentence"
	  embed_param {
	    input_dim: 46168  # youtube_movie_vocab + 1
	    num_output: 500
	    bias_term: false
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	  }
	  param { lr_mult: 1 }
	}
	# Optional in-place dropout on the embedded sentence; the layer is only
	# included when the "dropEn" stage is set.
	layer {
	  name: "drop_input_en"
	  type: "Dropout"
	  include {
	    stage: "dropEn"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "embedded_input_sentence"
	  top: "embedded_input_sentence"
	}
	# Unfactored model: concatenate the embedded frames, embedded sentence and stage indicator.
	# Joins the embedded frames, the embedded sentence and the 3-axis stage
	# indicator along axis 2. Included only for the "unfactored" stage.
	layer {
	  name: "concat"
	  type: "Concat"
	  bottom: "embedded_input_frames"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "embedded_input_video_sequence"
	  # `axis` replaces the deprecated `concat_dim` alias; the axis is still 2.
	  concat_param { axis: 2 }
	  include { stage: "unfactored" }
	}
	# First LSTM of the "unfactored" variant: consumes the concatenated
	# frame/sentence/stage input. cont_sentence is passed as the second bottom.
	layer {
	  name: "lstm1"
	  type: "LSTM"
	  include {
	    stage: "unfactored"
	  }
	  bottom: "embedded_input_video_sequence"
	  bottom: "cont_sentence"
	  top: "lstm1"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Second LSTM of the "unfactored" variant, stacked directly on lstm1.
	# The single include rule lists two stages: "unfactored" and "2-layer".
	layer {
	  name: "lstm2"
	  type: "LSTM"
	  include {
	    stage: "unfactored"
	    stage: "2-layer"
	  }
	  bottom: "lstm1"
	  bottom: "cont_sentence"
	  top: "lstm2"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# First LSTM of the "factored" variant: sees only the embedded frames.
	# cont_sentence is passed as the second bottom.
	layer {
	  name: "lstm1"
	  type: "LSTM"
	  include {
	    stage: "factored"
	  }
	  bottom: "embedded_input_frames"
	  bottom: "cont_sentence"
	  top: "lstm1"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Optional in-place dropout on the lstm1 output; the layer is only included
	# when the "dropLstm1" stage is set.
	layer {
	  name: "drop_lstm1"
	  type: "Dropout"
	  include {
	    stage: "dropLstm1"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "lstm1"
	  top: "lstm1"
	}
	# Joins the lstm1 output, the embedded sentence and the 3-axis stage indicator
	# along axis 2. Included only for the "factored" stage.
	layer {
	  name: "concat"
	  type: "Concat"
	  bottom: "lstm1"
	  bottom: "embedded_input_sentence"
	  bottom: "stage_indicator_3axis"
	  top: "lstm1_video_sequence"
	  # `axis` replaces the deprecated `concat_dim` alias; the axis is still 2.
	  concat_param { axis: 2 }
	  include { stage: "factored" }
	}
	# Second LSTM of the "factored" variant: consumes lstm1 output concatenated
	# with the embedded sentence and stage indicator.
	layer {
	  name: "lstm2"
	  type: "LSTM"
	  include {
	    stage: "factored"
	  }
	  bottom: "lstm1_video_sequence"
	  bottom: "cont_sentence"
	  top: "lstm2"
	  recurrent_param {
	    num_output: 1000
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	}
	# Optional in-place dropout on the lstm2 output; the layer is only included
	# when the "dropLstm2" stage is set.
	layer {
	  name: "drop_lstm2"
	  type: "Dropout"
	  include {
	    stage: "dropLstm2"
	  }
	  dropout_param {
	    dropout_ratio: 0.5
	  }
	  bottom: "lstm2"
	  top: "lstm2"
	}
	# Output projection for the single-layer case: maps lstm1 to 46168 scores.
	# Excluded whenever the "2-layer" stage is set.
	layer {
	  name: "predict"
	  type: "InnerProduct"
	  exclude {
	    stage: "2-layer"
	  }
	  bottom: "lstm1"
	  top: "predict"
	  inner_product_param {
	    axis: 2
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	  # First param block (weights): base learning rate, weight decay enabled.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Output projection for the two-layer case: maps lstm2 to 46168 scores.
	# Included only when the "2-layer" stage is set.
	layer {
	  name: "predict"
	  type: "InnerProduct"
	  include {
	    stage: "2-layer"
	  }
	  bottom: "lstm2"
	  top: "predict"
	  inner_product_param {
	    axis: 2
	    num_output: 46168  # youtube_movie_vocab + 1
	    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
	    bias_filler { type: "constant" value: 0 }
	  }
	  # First param block (weights): base learning rate, weight decay enabled.
	  param { lr_mult: 1 decay_mult: 1 }
	  # Second param block (bias): doubled learning rate, no weight decay.
	  param { lr_mult: 2 decay_mult: 0 }
	}
	# Softmax cross-entropy between predict and target_sentence, with the softmax
	# taken over axis 2. Targets equal to -1 are ignored; the loss is weighted by 20.
	layer {
	  name: "cross_entropy_loss"
	  type: "SoftmaxWithLoss"
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "cross_entropy_loss"
	  loss_weight: 20
	  softmax_param { axis: 2 }
	  loss_param { ignore_label: -1 }
	}
	# TEST-only accuracy of predict against target_sentence, evaluated over axis 2
	# and skipping targets equal to -1.
	layer {
	  name: "accuracy"
	  type: "Accuracy"
	  include {
	    phase: TEST
	  }
	  bottom: "predict"
	  bottom: "target_sentence"
	  top: "accuracy"
	  accuracy_param { ignore_label: -1 axis: 2 }
	}