Dl4j - gensim: comparing inference of vectors
- Copy all files into a directory
- Execute testdata.sh (downloads and prepares the IMDB data)
- Run "sbt run"
- Compare the results
name := "InferVectorComparison"
scalaVersion := "2.11.8"
libraryDependencies += "org.nd4j" % "nd4j-native-platform" % "0.8.0"
libraryDependencies += "org.deeplearning4j" % "deeplearning4j-core" % "0.8.0"
libraryDependencies += "org.deeplearning4j" % "deeplearning4j-nlp" % "0.8.0"
libraryDependencies += "com.github.pathikrit" %% "better-files" % "2.17.1"
libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.2"
import better.files.File
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors
import org.deeplearning4j.models.sequencevectors.interfaces.SequenceIterator
import org.deeplearning4j.models.sequencevectors.sequence.Sequence
import org.deeplearning4j.models.word2vec.VocabWord
import org.nd4j.linalg.ops.transforms.Transforms
object Doc2VecIMDB extends App {

  val vec = new ParagraphVectors.Builder()
    .seed(42)
    .minWordFrequency(2)
    .iterations(20)
    // .epochs(20)
    .layerSize(100)
    // .learningRate(0.025)
    // .windowSize(5)
    // .useHierarchicSoftmax(true)
    // .sampling(1e-5) // always use!
    .negativeSample(5)
    .workers(12)
    .trainElementsRepresentation(false) // DBOW-style: train only paragraph (label) vectors, not word vectors
    // .tokenizerFactory(t)
    // .sequenceLearningAlgorithm(new DM[VocabWord]())
    // .sequenceLearningAlgorithm(new DBOW[VocabWord]())
    // .elementsLearningAlgorithm(new SkipGram[VocabWord]())
    // .useExistingWordVectors(WordVectorSerializer.readWord2VecModel(jobConfig.w2vFile, true))
    .iterate(new TestIterator(() => {
      // Reshuffle the corpus on every pass over the data.
      scala.util.Random.shuffle(File("""/home/wbuchner/doc2vec/imdb/alldata-id.txt""").lines.toList).toIterator
    }))
    .build()
  /** Feeds labelled sequences to ParagraphVectors: the first token of each
    * line is used as the sequence label, the remaining tokens as elements. */
  class TestIterator(val underlyingIteratorFactory: () => Iterator[String]) extends SequenceIterator[VocabWord] {

    var underlyingIterator: Iterator[String] = underlyingIteratorFactory()

    def hasMoreSequences: Boolean = underlyingIterator.hasNext

    def reset(): Unit = underlyingIterator = underlyingIteratorFactory()

    def nextSequence(): Sequence[VocabWord] = {
      val result = new Sequence[VocabWord]()
      val line = underlyingIterator.next()
      val tokens = line.split(" ")
      result.addSequenceLabel(new VocabWord(1.0, tokens.head))
      tokens.tail.filter(_.nonEmpty).foreach(s => result.addElement(new VocabWord(1.0, s)))
      result
    }
  }
  vec.fit()

  // Persist the trained model so the comparison can be rerun without retraining.
  WordVectorSerializer.writeParagraphVectors(vec, File("""/home/wbuchner/doc2vec/imdb/imdb_doc2vec.zip""").toJava)

  def tokenize(text: String): List[VocabWord] = text.split(" ").filter(_.nonEmpty).map(new VocabWord(1.0, _)).toList

  import scala.collection.JavaConverters._

  // The text of training document 25430 (label _*25430), reproduced verbatim.
  val text = """first , the positives : an excellent job at depicting urban landscapes to suit the mood of the film . some of the shots could be paintings by de chirico . sophie marceau , beautiful . the negatives : the stories are hard to believe . unreal , uni-dimensional characters preen and posture 100% of the time , as if they were in some kind of catwalk . this is neither the antonioni of his earlier , much better movies nor the wenders we've all come to know and appreciate . malkovich is excess baggage in this movie ."""

  // Compare the vector learned for document 25430 during training against a freshly inferred vector for the same text.
  val pretrainedVector = vec.getWordVectorMatrix("_*25430")
  val inferredVector = vec.inferVector(tokenize(text).asJava)
  val sim = Transforms.cosineSim(pretrainedVector, inferredVector)
  println(s"25430 <-> inferred = $sim")
  println(s"Gensim DBOW: 25430 <-> inferred = 0.93")
}
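As a quick sanity check, the persisted zip can be read back with WordVectorSerializer.readParagraphVectors and stored label vectors compared directly. A minimal sketch under the same path assumptions as above (the object name ReloadCheck and the baseline label _*0 are arbitrary choices for illustration):

import better.files.File
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
import org.nd4j.linalg.ops.transforms.Transforms

object ReloadCheck extends App {
  // Read the ParagraphVectors model persisted by Doc2VecIMDB above.
  val restored = WordVectorSerializer.readParagraphVectors(
    File("""/home/wbuchner/doc2vec/imdb/imdb_doc2vec.zip""").toJava)

  // Stored document vectors are looked up by their training label.
  val v25430 = restored.getWordVectorMatrix("_*25430")
  val v0 = restored.getWordVectorMatrix("_*0")
  println(s"25430 <-> 0 = ${Transforms.cosineSim(v25430, v0)}")
}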
testdata.sh, adapted from the gensim doc2vec IMDB notebook: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
function normalize_text {
  awk '{print tolower($0);}' < $1 | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
    -e 's/\;/ \; /g' -e 's/\:/ \: /g' > $1-norm
}
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
for j in train/pos train/neg test/pos test/neg train/unsup; do
  rm -f temp
  for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done
  normalize_text temp
  mv temp-norm aclImdb/$j/norm.txt
done
mv aclImdb/train/pos/norm.txt train-pos.txt
mv aclImdb/train/neg/norm.txt train-neg.txt
mv aclImdb/test/pos/norm.txt test-pos.txt
mv aclImdb/test/neg/norm.txt test-neg.txt
mv aclImdb/train/unsup/norm.txt train-unsup.txt
cat train-pos.txt train-neg.txt test-pos.txt test-neg.txt train-unsup.txt > alldata.txt
# Prefix every line with a unique id of the form _*<n>; these ids become the paragraph labels.
awk 'BEGIN{a=0;}{print "_*" a " " $0; a++;}' < alldata.txt > alldata-id.txt
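Each line of alldata-id.txt now starts with its _*<n> label followed by the tokenized review text, which is exactly the format TestIterator above splits on. A minimal sketch to eyeball the result (hypothetical object name, path assumed as in the Scala code above):

import better.files.File

object FormatCheck extends App {
  // The first line should read: "_*0 <first tokens of the first review> ...".
  val first = File("""/home/wbuchner/doc2vec/imdb/alldata-id.txt""").lines.head
  val tokens = first.split(" ")
  println(s"label = ${tokens.head}; first words = ${tokens.tail.take(5).mkString(" ")}")
}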