Skip to content

Instantly share code, notes, and snippets.

@mattiasostmar
Created October 20, 2015 11:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattiasostmar/f6972d9935c9d9bbc261 to your computer and use it in GitHub Desktop.
Save mattiasostmar/f6972d9935c9d9bbc261 to your computer and use it in GitHub Desktop.
import regex
import logging
import gensim
from gensim import corpora, models
# Configure the root logger so that INFO-level records (and above) are
# emitted, each prefixed with a timestamp and its severity level.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
class MySentences(object):
    """Re-iterable stream of tokenized lines read from a text file.

    Every iteration pass re-opens ``fname`` and yields one list of tokens
    per line. The object can therefore be iterated more than once, and the
    file is streamed line by line rather than read into memory as a whole.
    """

    # Compiled once for the class instead of being resolved for every line.
    _PUNCTUATION = regex.compile(r"[\:\!\?\.\,]")

    def __init__(self, fname):
        """Store the path of the file to read; nothing is opened yet.

        Args:
            fname: Path of the text file. Each line is treated as one
                sentence when iterating.
        """
        self.fname = fname

    def __iter__(self):
        """Yield the tokens of each line of the file, in file order.

        The characters ``: ! ? . ,`` are deleted from the line, which is
        then split on runs of whitespace. A line without any tokens (for
        example a blank line) yields an empty list.

        Yields:
            A list of token strings for each line.
        """
        # The context manager closes the file as soon as the pass is
        # exhausted or the generator is closed, instead of leaving the open
        # handle to the garbage collector.
        with open(self.fname) as handle:
            for line in handle:
                yield self._PUNCTUATION.sub("", line).split()
# Stream the corpus lazily from disk, one tokenized line at a time.
# NOTE(review): the path is hard-coded to one user's Downloads folder;
# adjust it before running elsewhere.
sentences = MySentences(
    "/Users/mos/Downloads/SOUtxtAllBigFile.txt"
)

# Train the Word2Vec model on the streamed sentences.
# NOTE(review): `size` is the pre-4.0 gensim keyword; gensim 4.0 renamed it
# to `vector_size` -- confirm which gensim version this script targets.
model = models.Word2Vec(
    sentences,
    size=200,
    window=5,
    min_count=30,
    workers=4,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment