-
-
Save JonyD/c514a856e6af4c9b0816dc44dde6fab9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Code excerpts
final SentenceIterator it1 = loadData(fileName); | |
final TokenizerFactory tf = tokenizeData(); | |
final int layerSize = 100; | |
final long seed = 5; | |
final int windowSize = 50; | |
final int minWordFrequency = 2; | |
final Word2Vec vec = trainModel(it1, tf, layerSize, seed, windowSize, minWordFrequency); | |
////////////////////////////////////////////////////////////////////////////////////////////////// | |
SentenceIterator loadData(final String fileName) { | |
SentenceIterator sentenceIterator = null; | |
try { | |
final String filePath = new ClassPathResource(fileName).getFile().getAbsolutePath(); | |
// strip white space before & after | |
sentenceIterator = new BasicLineIterator(filePath); | |
} catch (final IOException e1) { | |
logger.error(e1.getMessage()); | |
e1.printStackTrace(); | |
} | |
return sentenceIterator; | |
} | |
////////////////////////////////////////////////////////////////////////////////////////////////// | |
TokenizerFactory tokenizeData() { | |
// Split on white spaces in the line to get words | |
final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); | |
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); | |
return tokenizerFactory; | |
} | |
////////////////////////////////////////////////////////////////////////////////////////////////// | |
Word2Vec trainModel( final SentenceIterator iterator, | |
final TokenizerFactory tokenizerFactory, final int layerSize, final long seed, | |
final int windowSize, final int minWordFrequency) { | |
final Word2Vec vec = | |
new Word2Vec.Builder().minWordFrequency(minWordFrequency).layerSize(layerSize).seed(seed) | |
.windowSize(windowSize).iterate(iterator).tokenizerFactory(tokenizerFactory).build(); | |
vec.fit(); | |
return vec; | |
} | |
////////////////////////////////////////////////////////////////////////////////////////////////// | |
TEST RESULT (EXCEPTION) | |
12:21:31.770 [main] INFO com.yoochoose.ml.word2vec.Word2VecRaw - Load & Vectorize Sentences | |
12:21:31.773 [main] INFO com.yoochoose.ml.word2vec.Word2VecRaw - Tokenizing data... | |
12:21:31.774 [main] INFO com.yoochoose.ml.word2vec.Word2VecRaw - Building model... | |
12:21:31.811 [main] INFO org.nd4j.linalg.factory.Nd4jBackend - Loaded [CpuBackend] backend | |
12:21:32.455 [main] INFO org.nd4j.nativeblas.NativeOpsHolder - Number of threads used for NativeOps: 4 | |
12:21:32.589 [main] INFO org.nd4j.nativeblas.Nd4jBlas - Number of threads used for BLAS: 4 | |
12:21:32.593 [main] INFO org.nd4j.linalg.api.ops.executioner.DefaultOpExecutioner - Backend used: [CPU]; OS: [Linux] | |
12:21:32.593 [main] INFO org.nd4j.linalg.api.ops.executioner.DefaultOpExecutioner - Cores: [4]; Memory: [0.9GB]; | |
12:21:32.593 [main] INFO org.nd4j.linalg.api.ops.executioner.DefaultOpExecutioner - Blas vendor: [OPENBLAS] | |
12:21:32.763 [main] INFO com.yoochoose.ml.word2vec.Word2VecRaw - Fitting Word2Vec model... | |
12:21:32.780 [main] INFO org.deeplearning4j.models.sequencevectors.SequenceVectors - Starting vocabulary building... | |
12:21:32.780 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Target vocab size before building: [0] | |
12:21:32.843 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Trying source iterator: [0] | |
12:21:32.843 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Target vocab size before building: [0] | |
12:21:35.983 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Waiting till all processes stop... | |
12:21:35.985 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Vocab size before truncation: [0], NumWords: [0], sequences parsed: [10000], counter: [0] | |
12:21:35.985 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Scavenger: Words before: 0; Words after: 0; | |
12:21:35.985 [main] DEBUG org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Vocab size after truncation: [0], NumWords: [0], sequences parsed: [10000], counter: [0] | |
12:21:36.035 [main] INFO org.deeplearning4j.models.word2vec.wordstore.VocabConstructor - Sequences checked: [10000], Current vocabulary size: [0]; Sequences/sec: [3073.14]; | |
12:21:36.051 [main] INFO org.deeplearning4j.models.embeddings.loader.WordVectorSerializer - Projected memory use for model: [0.00 MB] | |
[Utils] Attempting to create /work/ebx/applications/ml/test-output/Default suite/Default test.html | |
[Utils] Directory /work/ebx/applications/ml/test-output/Default suite exists: true | |
[Utils] Attempting to create /work/ebx/applications/ml/test-output/Default suite/Default test.xml | |
[Utils] Directory /work/ebx/applications/ml/test-output/Default suite exists: true | |
FAILED: testX | |
java.lang.IllegalStateException: You can't fit() model with empty Vocabulary or WeightLookupTable | |
at org.deeplearning4j.models.sequencevectors.SequenceVectors.fit(SequenceVectors.java:220) | |
at com.yoochoose.ml.word2vec.Word2VecRaw.trainModel(Word2VecRaw.java:202) | |
at com.yoochoose.ml.word2vec.Word2VecRaw.findInFileNWordsCloserTo_New(Word2VecRaw.java:76) | |
at com.yoochoose.ml.word2vec.Word2VecRawTest.testX(Word2VecRawTest.java:116) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment