Skip to content

Instantly share code, notes, and snippets.

@msubhash
Last active January 3, 2018 15:22
Show Gist options
  • Save msubhash/468390e22ed660a1eafdae62a33c965c to your computer and use it in GitHub Desktop.
Save msubhash/468390e22ed660a1eafdae62a33c965c to your computer and use it in GitHub Desktop.
Word2Vec on Catalog description
from gensim.models import Phrases
from gensim.models import Word2Vec
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
# Module-level buffer that collects the corpus lines handed to Word2Vec.
sentences = list()

# Path the trained model is saved to.
fname = "model.mm"

# Text corpus that is read line by line.
input_file = "stories_15k.txt"
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop words for *language* as a cached frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


# Remove stop words. Not doing all the data cleaning here.
def preprocess(sentence):
    """Lower-case *sentence*, tokenize it and drop English stop words.

    Args:
        sentence: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token is left.
    """
    stop_words = _stopword_set('english')
    # r'\w+' keeps runs of word characters only, so punctuation is discarded.
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence.lower())
    return " ".join(w for w in tokens if w not in stop_words)
# Preprocess - lower-case, tokenize and remove stop words from every line, so
# punctuation and stop words do not end up in the model vocabulary.
with open(input_file) as f:
    # Iterate the file lazily; there is no need to hold the raw lines as well.
    sentences.extend(preprocess(line) for line in f)

# Initialize model. Word2Vec takes one list of tokens per sentence, hence the
# split() of each cleaned, space-joined line.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim 4.x renamed it to
# `vector_size` - confirm the installed version before upgrading.
model = Word2Vec(
    [s.split() for s in sentences],
    size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Save the model
model.save(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment