Skip to content

Instantly share code, notes, and snippets.

@vitojph
Last active February 17, 2017 08:39
Show Gist options
  • Save vitojph/763cb0b150b5634dd89b41647c77c116 to your computer and use it in GitHub Desktop.
Save vitojph/763cb0b150b5634dd89b41647c77c116 to your computer and use it in GitHub Desktop.
Reads a collection of bzip2-compressed text files containing Wikipedia articles and creates a word2vec model.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import gensim
import logging
import os
import bz2
class WikiCorpus(object):
    """Restartable stream of tokenized lines from bzip2'ed text documents.

    The given directory is walked recursively and every file found is opened
    with ``bz2.open`` in text mode. Each line is split on whitespace and
    yielded as a list of tokens (an empty list for a blank line).

    Every call to ``__iter__`` walks the directory again, so the corpus can
    be iterated more than once.

    Args:
        directory: Root directory containing the bzip2-compressed files
            (directly or in nested sub-directories).
        encoding: Text encoding passed to ``bz2.open``. ``None`` (the
            default) keeps the platform/locale default encoding.
    """

    def __init__(self, directory, encoding=None):
        self.directory = directory
        self.encoding = encoding

    def __iter__(self):
        """Yield one list of whitespace-separated tokens per line."""
        for subdir, dirs, files in os.walk(self.directory):
            # Sorting in place makes os.walk descend in a stable order, so
            # every pass over the corpus sees the sentences in the same order.
            dirs.sort()
            for name in sorted(files):
                path = os.path.join(subdir, name)
                # The context manager closes each file as soon as it has been
                # consumed (or when the generator is closed early).
                with bz2.open(path, 'rt', encoding=self.encoding) as handle:
                    for line in handle:
                        yield line.split()
if __name__ == "__main__":
    # Script entry point: train a word2vec model on the corpus directory,
    # save it to disk, then load it back and log its corpus count.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    CORPUSDIR = '/data/es.wiki.txt/'
    # Single definition of the model path, shared by save() and load().
    MODELPATH = '/data/eswiki-150.w2v'

    sentences = WikiCorpus(CORPUSDIR)

    logging.info('Building vocabulary and training')
    # NOTE(review): gensim >= 4.0 renamed the `size` keyword to `vector_size`;
    # confirm against the installed gensim version before upgrading.
    model = gensim.models.Word2Vec(sentences, min_count=10, size=150, workers=2)

    # the model can be also trained in two different steps, as in
    #model = gensim.models.Word2Vec() # creates an empty model
    #model.build_vocab(sentences) # 1st step to create the vocabulary
    #model.train(sentences) # 2nd step to create vectors

    logging.info('Saving the model...')
    model.save(MODELPATH)
    logging.info('Done')

    logging.info('Loading the model...')
    model = gensim.models.Word2Vec.load(MODELPATH)
    # logging applies %-formatting of the message with the extra arguments,
    # so the message needs a placeholder for the value being logged.
    logging.info('The model contains %s items', model.corpus_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment