Last active
October 25, 2018 01:17
-
-
Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
nltk, gensim word2vec, sklearn t-SNE embedding, matplotlib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import re | |
import string | |
from gensim.models import Word2Vec, Phrases | |
from sklearn.manifold import TSNE | |
import matplotlib.pyplot as plt | |
from nltk.corpus import stopwords | |
from itertools import islice | |
fl = sys.argv[1]  # path to the input corpus, passed on the command line

# fix: don't shadow the imported nltk `stopwords` corpus module — bind the
# English stopword list to its own name instead.
stop_words = stopwords.words('english')
# Map removable stopwords to ''; any stopword containing "no" (no, not,
# nor, ...) is excluded so negations survive cleaning.
stop_dict = {tm: '' for tm in stop_words if 'no' not in tm}
def txt_cln(tok):
    """Normalize one raw token for Word2Vec training.

    Collapses clock times, dates, and .com URLs to placeholder words,
    strips punctuation, drops removable stopwords, and lowercases.

    Returns the cleaned lowercase token, or None when the token reduces
    to nothing (empty / whitespace / a stopword in stop_dict).
    """
    # fix: lowercase FIRST so the stopword lookup below works for
    # capitalized input ("The") and the URL pattern matches uppercase hosts.
    tok = tok.lower()
    tok = re.sub(r'\d+\:\d+', 'clocktime', tok)
    tok = re.sub(r'(\d+/\d+/\d+)', 'datetime', tok)
    tok = re.sub(r'(\d+-\d+-\d+)', 'datetime', tok)
    # fix: original pattern r'w+\.' only matched literal runs of the letter
    # "w" (e.g. "www."); '\w+' matches any leading host label.
    tok = re.sub(r'\w+\.[a-z0-9]+\.com', 'url', tok)
    # Drop periods except decimal points (lookarounds keep "3.14" intact).
    tok = re.sub(r'(?<!\d)\.(?!\d)', '', tok)
    tok = tok.strip()
    tok = tok.replace('/', ' ')
    tok = tok.translate(str.maketrans('', '', '_.<>='))
    # stop_dict maps removable stopwords to '' (negations are kept out of it);
    # non-stopwords pass through unchanged.
    tok = stop_dict.get(tok, tok)
    if tok and tok != ' ':
        return tok
class ClnSent(object):
    """Re-iterable stream over a text file yielding one cleaned, tokenized
    sentence (list of tokens) per line.

    Consecutive duplicate lines are skipped; lines whose tokens all clean
    to nothing are dropped. Being re-iterable makes it suitable as a
    gensim Word2Vec corpus.
    """

    def __init__(self, fname):
        self.fname = fname  # path to the input text file
        self.ll = ''        # last line seen, for consecutive-duplicate skipping

    def __iter__(self):
        # fix: open via a context manager so the handle is closed when
        # iteration ends (the original leaked it on every pass).
        with open(self.fname) as fh:
            for line in fh:
                if self.ll != line:
                    outlist = [x for x in map(txt_cln, line.split()) if x]
                    if outlist:
                        yield outlist
                self.ll = line
# --- Word2Vec training over the corpus, processed in fixed-size line chunks ---
window = 3        # Word2Vec context window
size = 64         # embedding dimensionality
nn = 50000000     # lines per chunk read from the input file
flg = 0           # 0 until the first chunk has been trained and saved

# fix: build the model path once — the original saved to
# 'w2v_model_blk_...' but loaded from 'w2v_model_blk...' (missing
# underscore), so the load always failed.
model_path = 'w2v_model_blk_' + str(window) + '_' + str(size)

with open(fl, 'r') as f:
    while True:
        next_n_lines = list(islice(f, nn))
        if not next_n_lines:
            break
        # NOTE(review): txt_cln is applied per *line* here (vs. per token in
        # ClnSent); lines cleaned to None/empty are dropped before splitting.
        sentences = list(map(txt_cln, next_n_lines))
        sentences = [x.split() for x in sentences if x]
        if flg == 0:
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            model.save(model_path)
            flg = 1  # fix: was never set, so the load branch was unreachable
        else:
            # NOTE(review): the loaded model is immediately replaced by a
            # fresh Word2Vec, so earlier chunks are discarded — incremental
            # training (build_vocab(update=True) + train) was likely
            # intended; confirm before relying on multi-chunk runs.
            model = Word2Vec.load(model_path)
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            model.save(model_path)

# --- 2-D t-SNE projection of the most frequent words ---
tsne = TSNE(perplexity=5, n_components=2, init='pca', n_iter=10000)
plot_only = 500  # visualize only the 500 most frequent vocabulary words
low_dim_embs = tsne.fit_transform(model.wv.syn0[:plot_only])
labels = model.wv.index2word[:plot_only]
def plot_with_labels(low_dim_embs, labels, filename='tsne_' + str(window) + '_' + str(size) + '.png'):
    """Scatter-plot 2-D embeddings, annotate each point with its word,
    and save the figure to *filename*."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for word, point in zip(labels, low_dim_embs):
        px, py = point
        plt.scatter(px, py)
        plt.annotate(word,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)

plot_with_labels(low_dim_embs, labels)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment