Last active
March 20, 2017 10:13
-
-
Save wobu/8d996928e71afca7e4a1c117d7fc98a8 to your computer and use it in GitHub Desktop.
DL4J vs. gensim: comparing inferred document vectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- Copy all files into a single directory.
- Run `testdata.sh` to download and prepare the data.
- Run `sbt run`.
- Compare the results.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sbt build definition for the DL4J vs. gensim inferVector comparison.
name := "InferVectorComparison"

scalaVersion := "2.11.8"

// `%` is used for the plain Java artifacts; `%%` appends the Scala binary version
// to the artifact name, which is needed for the cross-built better-files library.
libraryDependencies ++= Seq(
  "org.nd4j"              % "nd4j-native-platform" % "0.8.0",
  "org.deeplearning4j"    % "deeplearning4j-core"  % "0.8.0",
  "org.deeplearning4j"    % "deeplearning4j-nlp"   % "0.8.0",
  "com.github.pathikrit" %% "better-files"         % "2.17.1",
  "ch.qos.logback"        % "logback-classic"      % "1.2.2"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import better.files.File | |
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer | |
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors | |
import org.deeplearning4j.models.sequencevectors.interfaces.SequenceIterator | |
import org.deeplearning4j.models.sequencevectors.sequence.Sequence | |
import org.deeplearning4j.models.word2vec.VocabWord | |
import org.nd4j.linalg.ops.transforms.Transforms | |
/**
 * Trains a DL4J `ParagraphVectors` model on a labelled corpus, writes the model to disk and
 * prints the cosine similarity between the trained vector of one labelled document and the
 * vector inferred for that document's text, next to the value reported for gensim.
 *
 * Each corpus line is expected to look like `<label> <token> <token> ...`, separated by
 * single spaces.
 */
object Doc2VecIMDB extends App {

  // Declared before `vec` on purpose: TestIterator's constructor calls the corpus factory
  // immediately, so the path it reads must already be initialised at that point.
  private val corpusPath = """/home/wbuchner/doc2vec/imdb/alldata-id.txt"""
  private val modelPath = """/home/wbuchner/doc2vec/imdb/imdb_doc2vec.zip"""
  // Label of the corpus document whose text is reproduced in `text` below.
  private val referenceLabel = "_*25430"

  val vec = new ParagraphVectors.Builder()
    .seed(42)
    .minWordFrequency(2)
    .iterations(20)
    // .epochs(20)
    .layerSize(100)
    // .learningRate(0.025)
    // .windowSize(5)
    // .useHierarchicSoftmax(true)
    // .sampling(1e-5) // always use!
    .negativeSample(5)
    .workers(12)
    .trainElementsRepresentation(false)
    // .tokenizerFactory(t)
    // .sequenceLearningAlgorithm(new DM[VocabWord]())
    // .sequenceLearningAlgorithm(new DBOW[VocabWord]())
    // .elementsLearningAlgorithm(new SkipGram[VocabWord]())
    // .useExistingWordVectors(WordVectorSerializer.readWord2VecModel(jobConfig.w2vFile, true))
    .iterate(new TestIterator(() => {
      // The file is re-read and re-shuffled on every (re)start of the iterator. The shuffle
      // uses the global scala.util.Random, so the order is not tied to the builder's seed.
      scala.util.Random.shuffle(File(corpusPath).lines.toList).toIterator
    }))
    .build()

  /**
   * Adapts an iterator of corpus lines to DL4J's `SequenceIterator`.
   *
   * @param underlyingIteratorFactory produces a fresh line iterator; called once on
   *                                  construction and again on every `reset()`
   */
  class TestIterator(val underlyingIteratorFactory: () => Iterator[String]) extends SequenceIterator[VocabWord] {
    var underlyingIterator: Iterator[String] = underlyingIteratorFactory()

    def hasMoreSequences: Boolean = underlyingIterator.hasNext

    def reset(): Unit = underlyingIterator = underlyingIteratorFactory()

    /** Turns the next line into a sequence: first field is the label, the rest are elements. */
    def nextSequence(): Sequence[VocabWord] = {
      val result = new Sequence[VocabWord]()
      // Split off the label only; the remainder goes through `tokenize` so that training
      // and inference use exactly the same tokenisation rule.
      val parts = underlyingIterator.next().split(" ", 2)
      result.addSequenceLabel(new VocabWord(1.0, parts.head))
      parts.drop(1).foreach(body => tokenize(body).foreach(word => result.addElement(word)))
      result
    }
  }

  vec.fit()
  WordVectorSerializer.writeParagraphVectors(vec, File(modelPath).toJava)

  /**
   * Splits `text` on single spaces, drops empty fragments and wraps every remaining token
   * in a `VocabWord` with frequency 1.0.
   */
  def tokenize(text: String): List[VocabWord] =
    text.split(" ").filter(_.nonEmpty).map(new VocabWord(1.0, _)).toList

  import scala.collection.JavaConverters._

  val text = """first , the positives : an excellent job at depicting urban landscapes to suit the mood of the film . some of the shots could be paintings by de chirico . sophie marceau , beautiful . the negatives : the stories are hard to believe . unreal , uni-dimensional characters preen and posture 100% of the time , as if they were in some kind of catwalk . this is neither the antonioni of his earlier , much better movies nor the wenders we've all come to know and appreciate . malkovich is excess baggage in this movie ."""

  val pretrainedVector = vec.getWordVectorMatrix(referenceLabel)
  // NOTE(review): the lookup presumably yields null when the label is not part of the trained
  // model (e.g. a differently generated corpus) - confirm against the DL4J version in use.
  // Failing here gives a clear message instead of an opaque error inside cosineSim.
  require(
    pretrainedVector != null,
    s"No trained vector found for label '$referenceLabel'; was $corpusPath generated by testdata.sh?"
  )
  val inferredVector = vec.inferVector(tokenize(text).asJava)
  val sim = Transforms.cosineSim(pretrainedVector, inferredVector)
  println(s"25430 <-> inferred = $sim")
  println(s"Gensim DBOW: 25430 <-> inferred = 0.93")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalise a text file for tokenisation.
#   $1 - path of the input file; the result is written next to it as "$1-norm".
# Lower-cases every line, replaces "<br />" with a space and surrounds the
# punctuation characters . " , ( ) ! ? ; : with spaces so that each one becomes
# its own space-separated token.
# "$1" is quoted in both redirections so paths containing whitespace work.
normalize_text() {
  awk '{print tolower($0);}' < "$1" | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
    -e 's/\;/ \; /g' -e 's/\:/ \: /g' > "$1-norm"
}
# Fetch and unpack the aclImdb sentiment archive.
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz

# For every split: concatenate all review files into one file with one review
# per line, then normalise it.
for j in train/pos train/neg test/pos test/neg train/unsup; do
  # -f: temp does not exist yet on the first pass, plain rm would report an error.
  rm -f temp
  # The listing order decides the line order, and therefore the "_*N" ids
  # assigned at the end of this script, so the ls-based ordering is kept as is.
  for i in $(ls "aclImdb/$j"); do
    cat "aclImdb/$j/$i" >> temp
    # awk with only a BEGIN rule prints one empty record, i.e. a newline that
    # terminates the review just appended.
    awk 'BEGIN{print;}' >> temp
  done
  normalize_text temp
  mv temp-norm "aclImdb/$j/norm.txt"
done
# Remove the scratch file left over from the last split.
rm -f temp

mv aclImdb/train/pos/norm.txt train-pos.txt
mv aclImdb/train/neg/norm.txt train-neg.txt
mv aclImdb/test/pos/norm.txt test-pos.txt
mv aclImdb/test/neg/norm.txt test-neg.txt
mv aclImdb/train/unsup/norm.txt train-unsup.txt

# Join all splits and prefix every line with a running label "_*<n>" (starting at 0).
cat train-pos.txt train-neg.txt test-pos.txt test-neg.txt train-unsup.txt > alldata.txt
awk 'BEGIN{a=0;}{print "_*" a " " $0; a++;}' < alldata.txt > alldata-id.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment