Dl4j - gensim: comparing inference of vectors
- Copy all files into a directory
- Execute testdata.sh (downloads and prepares the IMDB data)
- Run "sbt run"
- Compare the results
name := "InferVectorComparison"
scalaVersion := "2.11.8"
libraryDependencies += "org.nd4j" % "nd4j-native-platform" % "0.8.0"
libraryDependencies += "org.deeplearning4j" % "deeplearning4j-core" % "0.8.0"
libraryDependencies += "org.deeplearning4j" % "deeplearning4j-nlp" % "0.8.0"
libraryDependencies += "com.github.pathikrit" %% "better-files" % "2.17.1"
libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.2"
import better.files.File
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors
import org.deeplearning4j.models.sequencevectors.interfaces.SequenceIterator
import org.deeplearning4j.models.sequencevectors.sequence.Sequence
import org.deeplearning4j.models.word2vec.VocabWord
import org.nd4j.linalg.ops.transforms.Transforms
object Doc2VecIMDB extends App {

  val vec = new ParagraphVectors.Builder()
    .seed(42)
    .minWordFrequency(2)
    .iterations(20)
    // .epochs(20)
    .layerSize(100)
    // .learningRate(0.025)
    // .windowSize(5)
    // .useHierarchicSoftmax(true)
    // .sampling(1e-5) // always use!
    .negativeSample(5)
    .workers(12)
    .trainElementsRepresentation(false) // DBOW-style: train only paragraph (label) vectors, not word vectors
    // .tokenizerFactory(t)
    // .sequenceLearningAlgorithm(new DM[VocabWord]())
    // .sequenceLearningAlgorithm(new DBOW[VocabWord]())
    // .elementsLearningAlgorithm(new SkipGram[VocabWord]())
    // .useExistingWordVectors(WordVectorSerializer.readWord2VecModel(jobConfig.w2vFile, true))
    .iterate(new TestIterator(() => {
      // Reshuffle the corpus on every pass over the data.
      scala.util.Random.shuffle(File("""/home/wbuchner/doc2vec/imdb/alldata-id.txt""").lines.toList).toIterator
    }))
    .build()
  /** Feeds labelled sequences to ParagraphVectors: the first token of each
    * line is used as the sequence label, the remaining tokens as elements. */
  class TestIterator(val underlyingIteratorFactory: () => Iterator[String]) extends SequenceIterator[VocabWord] {

    var underlyingIterator: Iterator[String] = underlyingIteratorFactory()

    def hasMoreSequences: Boolean = underlyingIterator.hasNext

    def reset(): Unit = underlyingIterator = underlyingIteratorFactory()

    def nextSequence(): Sequence[VocabWord] = {
      val result = new Sequence[VocabWord]()
      val line = underlyingIterator.next()
      val tokens = line.split(" ")
      result.addSequenceLabel(new VocabWord(1.0, tokens.head))
      tokens.tail.filter(_.nonEmpty).foreach(s => result.addElement(new VocabWord(1.0, s)))
      result
    }
  }
  vec.fit()

  // Persist the trained model so the comparison can be rerun without retraining.
  WordVectorSerializer.writeParagraphVectors(vec, File("""/home/wbuchner/doc2vec/imdb/imdb_doc2vec.zip""").toJava)

  def tokenize(text: String): List[VocabWord] = text.split(" ").filter(_.nonEmpty).map(new VocabWord(1.0, _)).toList

  import scala.collection.JavaConverters._

  // The text of training document 25430 (label _*25430), reproduced verbatim.
  val text = """first , the positives : an excellent job at depicting urban landscapes to suit the mood of the film . some of the shots could be paintings by de chirico . sophie marceau , beautiful . the negatives : the stories are hard to believe . unreal , uni-dimensional characters preen and posture 100% of the time , as if they were in some kind of catwalk . this is neither the antonioni of his earlier , much better movies nor the wenders we've all come to know and appreciate . malkovich is excess baggage in this movie ."""

  // Compare the vector learned for document 25430 during training against a freshly inferred vector for the same text.
  val pretrainedVector = vec.getWordVectorMatrix("_*25430")
  val inferredVector = vec.inferVector(tokenize(text).asJava)
  val sim = Transforms.cosineSim(pretrainedVector, inferredVector)
  println(s"25430 <-> inferred = $sim")
  println(s"Gensim DBOW: 25430 <-> inferred = 0.93")
}
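As a quick sanity check, the persisted zip can be read back with WordVectorSerializer.readParagraphVectors and stored label vectors compared directly. A minimal sketch under the same path assumptions as above (the object name ReloadCheck and the baseline label _*0 are arbitrary choices for illustration):

import better.files.File
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.nd4j.linalg.ops.transforms.Transforms

object ReloadCheck extends App {
  // Read the ParagraphVectors model persisted by Doc2VecIMDB above.
  val restored = WordVectorSerializer.readParagraphVectors(
    File("""/home/wbuchner/doc2vec/imdb/imdb_doc2vec.zip""").toJava)

  // Stored document vectors are looked up by their training label.
  val v25430 = restored.getWordVectorMatrix("_*25430")
  val v0 = restored.getWordVectorMatrix("_*0")
  println(s"25430 <-> 0 = ${Transforms.cosineSim(v25430, v0)}")
}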
testdata.sh, adapted from the gensim doc2vec IMDB notebook: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
function normalize_text {
  awk '{print tolower($0);}' < $1 | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
    -e 's/\;/ \; /g' -e 's/\:/ \: /g' > $1-norm
}
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
for j in train/pos train/neg test/pos test/neg train/unsup; do
  rm -f temp
  for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done
  normalize_text temp
  mv temp-norm aclImdb/$j/norm.txt
done
mv aclImdb/train/pos/norm.txt train-pos.txt
mv aclImdb/train/neg/norm.txt train-neg.txt
mv aclImdb/test/pos/norm.txt test-pos.txt
mv aclImdb/test/neg/norm.txt test-neg.txt
mv aclImdb/train/unsup/norm.txt train-unsup.txt
cat train-pos.txt train-neg.txt test-pos.txt test-neg.txt train-unsup.txt > alldata.txt
# Prefix every line with a unique id of the form _*<n>; these ids become the paragraph labels.
awk 'BEGIN{a=0;}{print "_*" a " " $0; a++;}' < alldata.txt > alldata-id.txt
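Each line of alldata-id.txt now starts with its _*<n> label followed by the tokenized review text, which is exactly the format TestIterator above splits on. A minimal sketch to eyeball the result (hypothetical object name, path assumed as in the Scala code above):

import better.files.File

object FormatCheck extends App {
  // The first line should read: "_*0 <first tokens of the first review> ...".
  val first = File("""/home/wbuchner/doc2vec/imdb/alldata-id.txt""").lines.head
  val tokens = first.split(" ")
  println(s"label = ${tokens.head}; first words = ${tokens.tail.take(5).mkString(" ")}")
}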