bikashg/gist:132bd63011636d07719fbd6fd97ef534

## gistfile1.txt
// Encoder-decoder (sequence-to-sequence) graph with two inputs:
//   "nlPhrase" -> encoder LSTM -> last time step -> repeated along the "owlAxiom" time axis
//   "owlAxiom" + repeated encoder summary -> decoder LSTM -> softmax output over the vocabulary
// Previously the encoder was not connected to anything downstream, so it had no effect on the output.
ComputationGraphConfiguration configuration = new NeuralNetConfiguration.Builder()
                .weightInit(WeightInit.XAVIER)
                .learningRate(0.25)
                .updater(Updater.RMSPROP)
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1) // One optimization iteration per minibatch.
                .seed(123) // Fixed seed for reproducible weight initialization.
                .graphBuilder()
                // These are the names of the two inputs to the computation graph.
                .addInputs("nlPhrase", "owlAxiom")
                // Both inputs are time series whose per-time-step feature vector has length vocab.size().
                .setInputTypes(InputType.recurrent(vocab.size()), InputType.recurrent(vocab.size()))
                // The inputs to the encoder have shape minibatch x featureSize x timeSteps.
                // The network only knows the feature vector size; the number of time steps is taken from the data.
                // Encoder: LSTM with 128 hidden units reading the natural-language phrase.
                .addLayer("encoder", new GravesLSTM.Builder().nIn(vocab.size()).nOut(128).activation(Activation.SOFTSIGN).build(), "nlPhrase")
                // Keep only the encoder activations at the last time step of "nlPhrase"
                // (the vertex is given the input name so it can honour that input's mask array, if any).
                .addVertex("lastTimeStep", new org.deeplearning4j.nn.conf.graph.rnn.LastTimeStepVertex("nlPhrase"), "encoder")
                // Repeat that 128-element summary once per time step of "owlAxiom" so it can be fed to the decoder.
                .addVertex("duplicateTimeStep", new org.deeplearning4j.nn.conf.graph.rnn.DuplicateToTimeSeriesVertex("owlAxiom"), "lastTimeStep")
                // Decoder: LSTM with 128 hidden units. It receives the axiom tokens concatenated with the
                // repeated encoder summary, hence nIn = vocab.size() + 128 (multiple layer inputs are merged along the feature axis).
                .addLayer("decoder", new GravesLSTM.Builder().nIn(vocab.size() + 128).nOut(128).activation(Activation.SOFTSIGN).build(), "owlAxiom", "duplicateTimeStep")
                // nIn for RnnOutputLayer is the number of hidden units (nOut) of the decoder LSTM.
                .addLayer("output", new RnnOutputLayer.Builder().nIn(128).nOut(vocab.size()).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build(), "decoder")
                .setOutputs("output")
                .pretrain(false).backprop(true)
                .build();

        // Build the graph object from the configuration (init() is not called here).
        ComputationGraph net = new ComputationGraph(configuration);
	// Encoder-decoder (sequence-to-sequence) graph with two inputs:
	//   "nlPhrase" -> encoder LSTM -> last time step -> repeated along the "owlAxiom" time axis
	//   "owlAxiom" + repeated encoder summary -> decoder LSTM -> softmax output over the vocabulary
	// Previously the encoder was not connected to anything downstream, so it had no effect on the output.
	ComputationGraphConfiguration configuration = new NeuralNetConfiguration.Builder()
		.weightInit(WeightInit.XAVIER)
		.learningRate(0.25)
		.updater(Updater.RMSPROP)
		.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1) // One optimization iteration per minibatch.
		.seed(123) // Fixed seed for reproducible weight initialization.
		.graphBuilder()
		// These are the names of the two inputs to the computation graph.
		.addInputs("nlPhrase", "owlAxiom")
		// Both inputs are time series whose per-time-step feature vector has length vocab.size().
		.setInputTypes(InputType.recurrent(vocab.size()), InputType.recurrent(vocab.size()))
		// The inputs to the encoder have shape minibatch x featureSize x timeSteps.
		// The network only knows the feature vector size; the number of time steps is taken from the data.
		// Encoder: LSTM with 128 hidden units reading the natural-language phrase.
		.addLayer("encoder", new GravesLSTM.Builder().nIn(vocab.size()).nOut(128).activation(Activation.SOFTSIGN).build(), "nlPhrase")
		// Keep only the encoder activations at the last time step of "nlPhrase"
		// (the vertex is given the input name so it can honour that input's mask array, if any).
		.addVertex("lastTimeStep", new org.deeplearning4j.nn.conf.graph.rnn.LastTimeStepVertex("nlPhrase"), "encoder")
		// Repeat that 128-element summary once per time step of "owlAxiom" so it can be fed to the decoder.
		.addVertex("duplicateTimeStep", new org.deeplearning4j.nn.conf.graph.rnn.DuplicateToTimeSeriesVertex("owlAxiom"), "lastTimeStep")
		// Decoder: LSTM with 128 hidden units. It receives the axiom tokens concatenated with the
		// repeated encoder summary, hence nIn = vocab.size() + 128 (multiple layer inputs are merged along the feature axis).
		.addLayer("decoder", new GravesLSTM.Builder().nIn(vocab.size() + 128).nOut(128).activation(Activation.SOFTSIGN).build(), "owlAxiom", "duplicateTimeStep")
		// nIn for RnnOutputLayer is the number of hidden units (nOut) of the decoder LSTM.
		.addLayer("output", new RnnOutputLayer.Builder().nIn(128).nOut(vocab.size()).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build(), "decoder")
		.setOutputs("output")
		.pretrain(false).backprop(true)
		.build();

	// Build the graph object from the configuration (init() is not called here).
	ComputationGraph net = new ComputationGraph(configuration);