Skip to content

Instantly share code, notes, and snippets.

Last active October 25, 2018 01:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
nltk, gensim word2vec, sklearn t-SNE embedding, matplotlib
import sys
import re
import string
from gensim.models import Word2Vec, Phrases
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from itertools import islice
# Path of the input text file, taken from the first command-line argument.
fl = sys.argv[1]

# NOTE: this rebinds the imported ``stopwords`` name to the plain list of
# English stop words returned by the corpus reader.
stopwords = stopwords.words('english')

# Stop word -> '' lookup used by txt_cln. Any stop word containing the
# substring 'no' is left out of the map (presumably so that negations such as
# "not" survive cleaning -- confirm intent).
stop_dict = dict.fromkeys((tm for tm in stopwords if 'no' not in tm), '')
def txt_cln(tok, stop_map=None):
    """Normalize one raw text fragment before it is fed to Word2Vec.

    Clock times (``12:30``), dates (``1/2/2018``, ``2018-10-25``) and
    ``*.com`` URLs are collapsed to the placeholder words ``clocktime``,
    ``datetime`` and ``url``. Slashes become spaces, the characters
    ``_.<>=`` are deleted, and the result is looked up in the stop-word map.

    Args:
        tok: Raw text fragment (a token or a whole line).
        stop_map: Optional mapping of stop word -> replacement text. When
            omitted, the module-level ``stop_dict`` is used.

    Returns:
        The cleaned, lower-cased text, or ``None`` when nothing usable is
        left (empty string, a single space, or a stop word mapped to '').
    """
    if stop_map is None:
        stop_map = stop_dict

    tok = re.sub(r'\d+\:\d+', 'clocktime', tok)
    tok = re.sub(r'(\d+/\d+/\d+)', 'datetime', tok)
    tok = re.sub(r'(\d+-\d+-\d+)', 'datetime', tok)
    tok = re.sub(r'w+\.[a-z0-9]+\.com', 'url', tok)
    # Drop dots that are not adjacent to a digit on either side.
    tok = re.sub(r'(?<!\d)\.(?!\d)', '', tok)
    tok = tok.strip()
    tok = tok.replace('/', ' ')
    # NOTE(review): '.' is in the deletion set below, so dots next to digits
    # ("3.14") are removed here as well -- confirm that is intended.
    trans_map = str.maketrans('', '', '_.<>=')
    tok = tok.translate(trans_map)
    # The lookup runs before lower-casing, so it is case-sensitive; text that
    # is not a stop word is kept unchanged.
    tok = stop_map.get(tok, tok)
    if tok and tok != ' ':
        return tok.lower()
    return None
class ClnSent(object):
    """Re-iterable stream of cleaned sentences read from a text file.

    Each line of the file is split on whitespace, every token is passed
    through ``txt_cln``, and tokens that clean to nothing are dropped. A line
    identical to the line immediately before it is skipped.

    Args:
        fname: Path of the text file to read.
    """

    def __init__(self, fname):
        self.fname = fname
        # Most recently read raw line, used to skip consecutive duplicates.
        self.ll = ''

    def __iter__(self):
        """Yield one non-empty list of cleaned tokens per distinct line."""
        # Reset per pass so every iteration over the file yields the same
        # sentences, regardless of where the previous pass ended.
        self.ll = ''
        # The context manager closes the file even if iteration stops early.
        with open(self.fname) as handle:
            for line in handle:
                if self.ll != line:
                    outlist = [x for x in map(txt_cln, line.split()) if x]
                    if outlist:
                        yield outlist
                self.ll = line
# Word2Vec hyper-parameters: context window and embedding dimensionality.
window = 3
size = 64
# Maximum number of input lines held in memory per training chunk.
nn = 50000000
# 0 until the first chunk has produced a model, 1 afterwards.
flg = 0
# One path for every save, so the file on disk is always the latest model.
model_path = 'w2v_model_blk_' + str(window) + '_' + str(size)

with open(fl, 'r') as f:
    while True:
        next_n_lines = list(islice(f, nn))
        if not next_n_lines:
            # End of file reached.
            break
        # Clean each line, drop lines that clean to nothing, then tokenize.
        sentences = [x.split() for x in map(txt_cln, next_n_lines) if x]
        if flg == 0:
            # First chunk: build the vocabulary and train a fresh model.
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            flg = 1
        else:
            # Later chunks: extend the vocabulary and keep training the same
            # model instead of replacing it with a new one.
            model.build_vocab(sentences, update=True)
            model.train(sentences, total_examples=model.corpus_count,
                        epochs=model.epochs)
        model.save(model_path)
# Upper bound on how many vocabulary entries are projected and labelled.
plot_only = 500
# Labels are the first ``plot_only`` entries of the model's word index.
labels = model.wv.index2word[:plot_only]
# Reduce the matching embedding rows to two dimensions for plotting.
tsne = TSNE(perplexity=5, n_components=2, init='pca', n_iter=10000)
low_dim_embs = tsne.fit_transform(model.wv.syn0[:plot_only])
def plot_with_labels(low_dim_embs, labels, filename='tsne_' + str(window) + '_' + str(size) + '.png'):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    Args:
        low_dim_embs: Array indexed as ``low_dim_embs[i, :]`` whose rows
            unpack into an ``(x, y)`` pair; must have at least
            ``len(labels)`` rows.
        labels: Sequence of text labels; ``labels[i]`` annotates row ``i``.
        filename: Path the figure is written to. The default is built from
            the module-level ``window`` and ``size`` at definition time.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # Place the label text slightly offset from its point.
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    # Write the finished figure to disk; without this the plot is never
    # output and ``filename`` would be unused.
    plt.savefig(filename)
# Plot the 2-D t-SNE projection with its word labels, using the default
# output filename of plot_with_labels.
plot_with_labels(low_dim_embs, labels)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment