Last active
October 25, 2018 01:17
-
-
Save smellslikeml/72099cab8055e20c47a6682fda943580 to your computer and use it in GitHub Desktop.
nltk, gensim word2vec, sklearn t-SNE embedding, matplotlib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import re | |
import string | |
from gensim.models import Word2Vec, Phrases | |
from sklearn.manifold import TSNE | |
import matplotlib.pyplot as plt | |
from nltk.corpus import stopwords | |
from itertools import islice | |
fl = sys.argv[1]  # path to the input corpus, passed on the command line

# fix: don't shadow the imported nltk `stopwords` corpus module — bind the
# English stopword list to its own name instead.
stop_words = stopwords.words('english')
# Map removable stopwords to ''; any stopword containing "no" (no, not,
# nor, ...) is excluded so negations survive cleaning.
stop_dict = {tm: '' for tm in stop_words if 'no' not in tm}
def txt_cln(tok):
    """Normalize one raw token for Word2Vec training.

    Collapses clock times, dates, and .com URLs to placeholder words,
    strips punctuation, drops removable stopwords, and lowercases.

    Returns the cleaned lowercase token, or None when the token reduces
    to nothing (empty / whitespace / a stopword in stop_dict).
    """
    # fix: lowercase FIRST so the stopword lookup below works for
    # capitalized input ("The") and the URL pattern matches uppercase hosts.
    tok = tok.lower()
    tok = re.sub(r'\d+\:\d+', 'clocktime', tok)
    tok = re.sub(r'(\d+/\d+/\d+)', 'datetime', tok)
    tok = re.sub(r'(\d+-\d+-\d+)', 'datetime', tok)
    # fix: original pattern r'w+\.' only matched literal runs of the letter
    # "w" (e.g. "www."); '\w+' matches any leading host label.
    tok = re.sub(r'\w+\.[a-z0-9]+\.com', 'url', tok)
    # Drop periods except decimal points (lookarounds keep "3.14" intact).
    tok = re.sub(r'(?<!\d)\.(?!\d)', '', tok)
    tok = tok.strip()
    tok = tok.replace('/', ' ')
    tok = tok.translate(str.maketrans('', '', '_.<>='))
    # stop_dict maps removable stopwords to '' (negations are kept out of it);
    # non-stopwords pass through unchanged.
    tok = stop_dict.get(tok, tok)
    if tok and tok != ' ':
        return tok
class ClnSent(object):
    """Re-iterable stream over a text file yielding one cleaned, tokenized
    sentence (list of tokens) per line.

    Consecutive duplicate lines are skipped; lines whose tokens all clean
    to nothing are dropped. Being re-iterable makes it suitable as a
    gensim Word2Vec corpus.
    """

    def __init__(self, fname):
        self.fname = fname  # path to the input text file
        self.ll = ''        # last line seen, for consecutive-duplicate skipping

    def __iter__(self):
        # fix: open via a context manager so the handle is closed when
        # iteration ends (the original leaked it on every pass).
        with open(self.fname) as fh:
            for line in fh:
                if self.ll != line:
                    outlist = [x for x in map(txt_cln, line.split()) if x]
                    if outlist:
                        yield outlist
                self.ll = line
# --- Word2Vec training over the corpus, processed in fixed-size line chunks ---
window = 3        # Word2Vec context window
size = 64         # embedding dimensionality
nn = 50000000     # lines per chunk read from the input file
flg = 0           # 0 until the first chunk has been trained and saved

# fix: build the model path once — the original saved to
# 'w2v_model_blk_...' but loaded from 'w2v_model_blk...' (missing
# underscore), so the load always failed.
model_path = 'w2v_model_blk_' + str(window) + '_' + str(size)

with open(fl, 'r') as f:
    while True:
        next_n_lines = list(islice(f, nn))
        if not next_n_lines:
            break
        # NOTE(review): txt_cln is applied per *line* here (vs. per token in
        # ClnSent); lines cleaned to None/empty are dropped before splitting.
        sentences = list(map(txt_cln, next_n_lines))
        sentences = [x.split() for x in sentences if x]
        if flg == 0:
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            model.save(model_path)
            flg = 1  # fix: was never set, so the load branch was unreachable
        else:
            # NOTE(review): the loaded model is immediately replaced by a
            # fresh Word2Vec, so earlier chunks are discarded — incremental
            # training (build_vocab(update=True) + train) was likely
            # intended; confirm before relying on multi-chunk runs.
            model = Word2Vec.load(model_path)
            model = Word2Vec(sentences, window=window, size=size, workers=6)
            model.save(model_path)

# --- 2-D t-SNE projection of the most frequent words ---
tsne = TSNE(perplexity=5, n_components=2, init='pca', n_iter=10000)
plot_only = 500  # visualize only the 500 most frequent vocabulary words
low_dim_embs = tsne.fit_transform(model.wv.syn0[:plot_only])
labels = model.wv.index2word[:plot_only]
def plot_with_labels(low_dim_embs, labels, filename='tsne_' + str(window) + '_' + str(size) + '.png'):
    """Scatter-plot 2-D embeddings, annotate each point with its word,
    and save the figure to *filename*."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for word, point in zip(labels, low_dim_embs):
        px, py = point
        plt.scatter(px, py)
        plt.annotate(word,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)

plot_with_labels(low_dim_embs, labels)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment