Forked from maxbellec/word2vec_tf_idf_from_wikipeida.py
Created
October 30, 2017 14:20
-
-
Save shahbazsyed/54b2ab1df77b77f56e164823e5bc1ad5 to your computer and use it in GitHub Desktop.
Create a Word2Vec model from Wikipedia with gensim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# logging is important to follow the progress of the long-running steps below
import logging
import multiprocessing

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import TfidfModel
from gensim.models.word2vec import Word2Vec
# Configure the root logger: timestamped "time: LEVEL: message" records,
# with everything at INFO and above emitted.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
# Set the level explicitly on the root logger so it applies even when
# basicConfig() was a no-op because handlers were already installed.
logging.root.setLevel(level=logging.INFO)
# Open the compressed Wikipedia dump as a gensim corpus.
# NOTE(review): the `lemmatize` keyword is presumably only accepted by older
# gensim releases (< 4.0) — confirm against the installed version.
wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)

# Fit a TF-IDF model on the corpus.
tfidf = TfidfModel(wiki)

# save for persistence
wiki.save('wiki.corpus')
tfidf.save('wiki.tfidf.model')
# word2vec: stream the tokenized articles and train the embedding model
class MySentences(object):
    """Restartable iterable over the articles of a corpus as lists of text tokens.

    Each pass over the instance calls ``get_texts()`` on the corpus again, so
    the object can be iterated more than once.

    Args:
        corpus: Object exposing ``get_texts()``, which yields one list of
            tokens per article. When omitted, the module-level ``wiki``
            corpus is looked up at iteration time.
    """

    def __init__(self, corpus=None):
        self._corpus = corpus

    def __iter__(self):
        # Resolve the fallback lazily so `wiki` only has to exist once
        # iteration actually starts.
        corpus = wiki if self._corpus is None else self._corpus
        for text in corpus.get_texts():
            # Tokens may arrive as bytes or as already-decoded text depending
            # on the producer; only bytes need (and support) decoding.
            yield [
                word.decode() if isinstance(word, bytes) else word
                for word in text
            ]
# Iterable handed to Word2Vec as its training sentences.
sentences = MySentences()

# Training hyper-parameters, forwarded to Word2Vec as keyword arguments.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# ones reportedly call it `vector_size` — confirm against the installed version.
params = {
    'size': 300,
    'window': 10,
    'min_count': 40,
    # Use every core but one, and never fewer than one worker.
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1e-3,
}

# Train the model and persist it to disk.
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment