Skip to content

Instantly share code, notes, and snippets.

@himangSharatun
Created May 31, 2018 03:21
Show Gist options
  • Save himangSharatun/f96f765da807689a1d99b66f44e45311 to your computer and use it in GitHub Desktop.
Save himangSharatun/f96f765da807689a1d99b66f44e45311 to your computer and use it in GitHub Desktop.
import pandas as pd
from gensim.models import Word2Vec
import logging
import re
# Display log: emit INFO-level messages as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Load data. header=None: the first CSV row is data, not column names.
data = pd.read_csv('training-data.csv', header=None)
# Preprocess text
def tokenize(sentence):
    """Split a sentence into lowercase alphabetic word tokens.

    The text is lowercased, every "." is deleted (so a dotted abbreviation
    such as "U.S." collapses into the single token "us"), and each run of
    two or more ASCII letters becomes one token. Digits, punctuation and
    single letters are dropped.

    Args:
        sentence: The raw text to tokenize. Non-string values (for example
            the float NaN that pandas yields for an empty CSV cell) are
            treated as containing no tokens.

    Returns:
        A list of lowercase tokens in order of appearance; empty when there
        is nothing to tokenize.
    """
    if not isinstance(sentence, str):
        # Without this guard a NaN / None cell would crash on ``.lower()``.
        return []
    remove_dots = sentence.lower().replace(".", "")
    return re.findall(r"[A-Za-z]{2,}", remove_dots)
# Tokenize every entry of the first CSV column into a list of token lists.
sentences = [tokenize(sentence) for sentence in data[0].values]
# Train and save word2vec model
# gensim 4.0 renamed the `size` and `iter` constructor keywords to
# `vector_size` and `epochs`; passing the old names there raises TypeError.
# Pick the spelling the installed release accepts so the script runs on
# both gensim 3.x and 4.x with the same hyperparameters.
import gensim

if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_params = {'vector_size': 100, 'epochs': 500}
else:
    w2v_params = {'size': 100, 'iter': 500}

# NOTE: per the gensim documentation, `seed` alone does not make training
# fully reproducible; that also requires workers=1 and a fixed
# PYTHONHASHSEED. Left unchanged here to keep the original training setup.
model = Word2Vec(sentences, min_count=5, seed=96, **w2v_params)
model.save('word2vec.bin')
# Load and test word2vec model
# Reload the model from the file just written, then look up the word
# 'pilot': first its most similar words, then its raw embedding vector.
loaded_w2v = Word2Vec.load('word2vec.bin')
word_vectors = loaded_w2v.wv
print(word_vectors.most_similar(positive=['pilot']))
print(word_vectors['pilot'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment