Created
July 4, 2017 09:54
-
-
Save MCardus/ea45d30b7748a9d7c0382c67e4003d32 to your computer and use it in GitHub Desktop.
doc2vec_training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
  * Trains a ParagraphVectors model from the text file at `inputPath` and
  * stores the built model in `_paragraphVectors`.
  *
  * The file is read through a `BasicLineIterator` and tokenized with a
  * `DefaultTokenizerFactory` configured with a `CommonPreprocessor`.
  * Any `Exception` raised while building or fitting the model is logged
  * (message and stack trace) and not rethrown. Exceptions raised while
  * opening the input file happen before the `try` and propagate to the caller.
  *
  * @param inputPath path of the text file used as training data
  */
def trainModel(inputPath: String): Unit = {
  log.info(s"Training model from data in path: $inputPath")
  val file: File = new File(inputPath)
  val iter = new BasicLineIterator(file)
  val cache = new AbstractCache[VocabWord]
  val tokenizer = new DefaultTokenizerFactory
  tokenizer.setTokenPreProcessor(new CommonPreprocessor)
  try {
    _paragraphVectors = new ParagraphVectors.Builder()
      .minWordFrequency(1)
      .iterations(5)
      .epochs(1)
      .layerSize(100)
      .learningRate(0.025)
      .windowSize(5)
      .iterate(iter)
      .trainWordVectors(false)
      .vocabCache(cache)
      .tokenizerFactory(tokenizer)
      .sampling(0)
      .build()
    // NOTE(review): `paragraphVectors` is presumably the accessor for
    // `_paragraphVectors` declared elsewhere in this class; confirm.
    paragraphVectors.fit()
    log.info("Model correctly trained")
  } catch {
    case e: Exception =>
      // Pass the throwable as well so the stack trace is not lost;
      // getMessage alone may be null and hides the failure origin.
      log.error("Model could not be trained. Trace: " + e.getMessage, e)
  } finally {
    // Release the reader the line iterator opened on the input file,
    // whether training succeeded or failed.
    iter.finish()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment