Last active
February 17, 2017 08:39
-
-
Save vitojph/763cb0b150b5634dd89b41647c77c116 to your computer and use it in GitHub Desktop.
Reads a collection of text files containing Wikipedia articles and creates a word2vec model.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import gensim | |
import logging | |
import os | |
import bz2 | |
class WikiCorpus(object):
    '''Iterable corpus over a directory tree of bzip2-compressed text files
    (Wikipedia articles).

    Each iteration walks ``directory`` recursively and yields one tokenized
    line at a time (a list of whitespace-separated tokens), which is the
    streaming "sentences" format gensim's Word2Vec expects.
    '''

    def __init__(self, directory):
        # Root directory that will be scanned recursively on each iteration.
        self.directory = directory

    def __iter__(self):
        for subdir, _dirs, files in os.walk(self.directory):
            for name in files:
                path = os.path.join(subdir, name)
                # 'rt' decompresses and decodes to text on the fly.
                # FIX: the original never closed the file handles; a `with`
                # block releases each one deterministically instead of
                # relying on garbage collection.
                with bz2.open(path, 'rt') as handle:
                    for line in handle:
                        yield line.split()
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Directory holding the bzip2'ed Wikipedia article text files.
    CORPUSDIR = '/data/es.wiki.txt/'
    # WikiCorpus streams tokenized lines lazily, so the full corpus is
    # never held in memory.
    sentences = WikiCorpus(CORPUSDIR)

    logging.info('Building vocabulary and training')
    # NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call
    # targets the pre-4.0 API — confirm against the installed gensim version.
    model = gensim.models.Word2Vec(sentences, min_count=10, size=150, workers=2)
    # the model can be also trained in two different steps, as in
    #model = gensim.models.Word2Vec() # creates an empty mode
    #model.build_vocab(sentences) # 1st step to create the vocabulary
    #model.train(sentences) # 2nd step to create vectors

    logging.info('Saving the model...')
    model.save('/data/eswiki-150.w2v')
    logging.info('Done')

    logging.info('Loading the model...')
    model = gensim.models.Word2Vec.load('/data/eswiki-150.w2v')
    # FIX: logging takes a %-style format string with lazy args, not
    # print-style positional strings — the original call raised a
    # formatting error inside the logging machinery at runtime.
    logging.info('The model contains %s items', model.corpus_count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment