Skip to content

Instantly share code, notes, and snippets.

@smellslikeml
Last active October 25, 2018 01:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
nltk, gensim word2vec, sklearn t-SNE embedding, matplotlib
#!/usr/bin/python
import sys
import re
import string
from gensim.models import Word2Vec, Phrases
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from itertools import islice
# Rebind `stopwords` from the imported NLTK corpus reader to its English word list.
stopwords = stopwords.words('english')

# Lookup table mapping each stop-word to '' so that txt_cln can blank it out.
# Any entry containing the substring 'no' is deliberately left out of the table.
stop_dict = dict.fromkeys((tm for tm in stopwords if 'no' not in tm), '')

# Path of the input text corpus: first command-line argument.
fl = sys.argv[1]
# Characters deleted outright from the text; built once instead of on every call.
_TXT_CLN_DROP = str.maketrans('', '', '_.<>=')

# (pattern, replacement) pairs, applied in order, that collapse clock times,
# dates and simple URLs into placeholder words and drop non-numeric dots.
_TXT_CLN_SUBS = (
    (re.compile(r'\d+\:\d+'), 'clocktime'),
    (re.compile(r'(\d+/\d+/\d+)'), 'datetime'),
    (re.compile(r'(\d+-\d+-\d+)'), 'datetime'),
    (re.compile(r'w+\.[a-z0-9]+\.com'), 'url'),
    (re.compile(r'(?<!\d)\.(?!\d)'), ''),
)


def txt_cln(tok):
    """Normalise a piece of text and drop it if it is a stop-word.

    Clock times, dates and ``www…com`` URLs are replaced by the placeholder
    words ``clocktime``, ``datetime`` and ``url``; ``/`` becomes a space; the
    characters ``_ . < > =`` are removed; the result is stripped and
    lower-cased.

    Args:
        tok: the text to clean (a token, or a whole line).

    Returns:
        The cleaned, lower-cased string, or ``None`` when nothing but
        whitespace is left or the cleaned text is a key of ``stop_dict``.
    """
    for pattern, replacement in _TXT_CLN_SUBS:
        tok = pattern.sub(replacement, tok)
    # NOTE(review): the translate step also deletes dots between digits, so
    # '3.14' becomes '314' even though the last regex above spares those dots.
    tok = tok.replace('/', ' ').translate(_TXT_CLN_DROP)
    # Strip after the replacements so blanks introduced by '/' do not survive
    # as whitespace-only or space-padded results.
    tok = tok.strip().lower()
    # Look the stop-word up in its lower-cased form: the result is returned
    # lower-cased, so 'The' must be filtered exactly like 'the'.
    tok = stop_dict.get(tok, tok)
    return tok or None
class ClnSent(object):
    """Re-iterable stream of cleaned, tokenised sentences read from a text file.

    Each pass opens ``fname``, skips any line identical to the line directly
    before it, splits the line on whitespace, runs every token through
    ``txt_cln`` and yields the surviving tokens as a list. Lines that end up
    with no tokens are not yielded.
    """

    def __init__(self, fname):
        """Remember the path of the file to stream; nothing is opened yet."""
        self.fname = fname
        # Most recently read raw line, used to detect consecutive duplicates.
        self.ll = ''

    def __iter__(self):
        """Yield one list of cleaned tokens per non-duplicate, non-empty line."""
        # Start every pass from a clean slate: otherwise the last line of the
        # previous pass could suppress an identical first line on this pass.
        self.ll = ''
        # The context manager guarantees the handle is closed when the
        # generator finishes or is discarded.
        with open(self.fname) as handle:
            for line in handle:
                if line == self.ll:
                    continue
                self.ll = line
                tokens = [tok for tok in map(txt_cln, line.split()) if tok]
                if tokens:
                    yield tokens
# --- Training configuration -------------------------------------------------
window = 3       # Word2Vec context window
size = 64        # dimensionality of the word vectors
nn = 50000000    # maximum number of lines read from the corpus per chunk
flg = 0          # 0 until a model has been trained, 1 afterwards

# Single checkpoint path used for every save, so that a later chunk always
# continues from the file written by the previous one.
model_path = 'w2v_model_blk_' + str(window) + '_' + str(size)

# --- Chunked training -------------------------------------------------------
with open(fl, 'r') as f:
    while True:
        next_n_lines = list(islice(f, nn))
        if not next_n_lines:
            break
        # NOTE(review): txt_cln receives whole lines here, so its stop-word
        # lookup only fires when an entire line is a single stop-word.
        sentences = list(map(txt_cln, next_n_lines))
        sentences = [x.split() for x in sentences if x]
        if not sentences:
            # Nothing usable in this chunk; Word2Vec cannot train on it.
            continue
        if flg == 0:
            # First usable chunk: build the vocabulary and train from scratch.
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            flg = 1
        else:
            # Later chunks: extend the existing vocabulary and keep training
            # instead of throwing the earlier chunks away.
            # `iter` is the gensim 3.x name of the epoch count, matching the
            # `size=` / `syn0` API used in this script.
            model.build_vocab(sentences, update=True)
            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        # Checkpoint after every chunk.
        model.save(model_path)

if flg == 0:
    sys.exit('no usable sentences found in ' + fl)

# --- 2-D projection of the learned vectors ----------------------------------
tsne = TSNE(perplexity=5, n_components=2, init='pca', n_iter=10000)
# Only the first `plot_only` vocabulary entries are projected and plotted
# (presumably the most frequent words, given gensim's default ordering — confirm).
plot_only = 500
low_dim_embs = tsne.fit_transform(model.wv.syn0[:plot_only])
labels = model.wv.index2word[:plot_only]
def plot_with_labels(low_dim_embs, labels, filename='tsne_' + str(window) + '_' + str(size) + '.png'):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    Args:
        low_dim_embs: 2-D array whose i-th row holds the (x, y) position of
            the i-th label.
        labels: sequence of text labels, at most one per embedding row.
        filename: output image path; the default is built from the
            module-level ``window`` and ``size`` at definition time.

    Raises:
        AssertionError: if there are more labels than embedding rows.
    """
    n_labels = len(labels)
    assert low_dim_embs.shape[0] >= n_labels, "More labels than embeddings"

    # Annotation settings shared by every point.
    annotate_opts = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )

    plt.figure(figsize=(18, 18))  # in inches
    # One scatter call per point, exactly as many as there are labels.
    for row, label in enumerate(labels):
        px, py = low_dim_embs[row, :]
        plt.scatter(px, py)
        plt.annotate(label, xy=(px, py), **annotate_opts)

    plt.savefig(filename)
# Draw the projected vectors with their word labels and save the image under
# the default file name ('tsne_<window>_<size>.png').
plot_with_labels(low_dim_embs, labels)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment