# wottpal/word2vecTSNE.py

## word2vecTSNE.py
# -*- coding: utf-8 -*-

import sys
import time
import numpy as np
import gensim
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from sklearn.manifold import TSNE
from sklearn.cluster import MiniBatchKMeans
from scipy.spatial import distance
import random
import math
from adjustText import adjust_text


VOCAB_SIZE = 10000      # number of leading word vectors fed to t-SNE (MAX 3.000.000)


# Load the Google-News Model (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM)
model_path = "./GoogleNews-vectors-negative300.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(
    model_path, binary=True)
# NOTE(review): `.wv.syn0` / `.wv.vocab` are legacy gensim attribute names --
# confirm they exist in the installed gensim version.
wv = model.wv.syn0              # word vectors (wv)
vocabulary = model.wv.vocab     # vocabulary (lexicon)


# Run TSNE on the first VOCAB_SIZE word vectors only
tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = tsne.fit_transform(wv[:VOCAB_SIZE, :])
# Materialise the (word, x, y) triples: a bare zip() is a one-shot iterator
# and would already be exhausted the second time it is walked. zip() stops at
# its shortest input, so there is one triple per row of Y.
# Assumes `vocabulary` iterates in the same order as the rows of `wv` -- TODO confirm.
word_positions = list(zip(vocabulary, Y[:, 0], Y[:, 1]))


def get_word_position(word):
    """Look up a word's row and 2-D coordinates in the t-SNE embedding.

    Only the first ``len(Y)`` entries of ``vocabulary`` (in iteration order)
    have a row in ``Y``, so only those words can be found. On success the
    position is also printed.

    Args:
        word: The word to look up.

    Returns:
        A ``(index, x, y)`` tuple with the word's row in ``Y`` and its
        coordinates, or ``(None, None, None)`` when the word is unknown or
        has no row in ``Y``.
    """
    not_found = (None, None, None)
    try:
        if word not in vocabulary:
            return not_found
    except TypeError:
        # The membership test itself failed (e.g. an unhashable `word` against
        # a dict-like vocabulary): treat the word as unknown.
        return not_found

    # Pairing with range(len(Y)) caps the scan at the embedded rows: no copy
    # of the whole vocabulary is built and Y is never indexed out of range.
    for word_index, candidate in zip(range(len(Y)), vocabulary):
        if candidate == word:
            word_x = Y[word_index, 0]
            word_y = Y[word_index, 1]
            print(f"'{word}' at ({word_x},{word_y})")
            return word_index, word_x, word_y
    return not_found


def get_random_color(pastel_factor=0.9):
    """Return a random RGB colour as a list of three floats.

    Each channel is drawn uniformly from [0, 1] and then shifted towards 1
    via ``(channel + pastel_factor) / (1.0 + pastel_factor)``, so a larger
    `pastel_factor` gives a lighter colour.
    """
    rgb = []
    for _ in range(3):
        channel = random.uniform(0, 1.0)
        rgb.append((channel + pastel_factor) / (1.0 + pastel_factor))
    return rgb


def save_word_plot(word, max_dist):
    """Plot the t-SNE surroundings of a word and save them as a PDF.

    Every word whose 2-D position lies closer than `max_dist` (Euclidean) to
    the position of `word` is drawn as a labelled point. The figure is
    written to ``results/plot_{word}_{max_dist}.pdf``; the ``results``
    directory is not created here. Nothing is plotted or saved when `word`
    has no position in the embedding.

    Args:
        word: The word whose surroundings are plotted.
        max_dist: Exclusive radius around the word, in t-SNE plot units.
    """
    word_index, word_x, word_y = get_word_position(word)
    if word_index is None or word_x is None or word_y is None:
        return

    # Open the figure only once the lookup has succeeded, so an unknown word
    # does not leave an empty figure behind that is never closed.
    plt.figure()

    # Collect the words located within max_dist of the word of interest
    centre = [word_x, word_y]
    vocab_new = []
    x_new = []
    y_new = []
    for w, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
        if distance.euclidean(centre, [x, y]) < max_dist:
            vocab_new.append(w)
            x_new.append(x)
            y_new.append(y)

    # Plot the points, then label each one
    plt.scatter(x_new, y_new, c=get_random_color())
    texts = [plt.text(x, y, label)
             for label, x, y in zip(vocab_new, x_new, y_new)]
    adjust_text(texts)
    plt.axis('off')
    filename = f'results/plot_{word}_{max_dist}.pdf'
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
    print(f"Saved '{filename}'")


def save_full_plot(show_labels=False, kmeans_clusters=8):
    """Plot the full t-SNE embedding, coloured by k-means cluster, as a PDF.

    The 2-D points in ``Y`` are clustered with MiniBatchKMeans and each point
    is drawn in its cluster's random pastel colour (plain gray when only one
    cluster is requested). The figure is written to
    ``results/full_plot_[labels_]{VOCAB_SIZE}_{kmeans_clusters}.pdf``.

    Args:
        show_labels: When true, annotate every point with its word.
        kmeans_clusters: Number of k-means clusters used for colouring.
    """
    bw = kmeans_clusters == 1
    plt.figure()
    kmeans = MiniBatchKMeans(n_clusters=kmeans_clusters)
    labels = kmeans.fit_predict(Y)
    colors = [get_random_color() for _ in range(kmeans_clusters)]
    if bw:
        colors = ["gray"]
    # Pair words with their coordinates locally so that every call sees all
    # points; a shared zip iterator would be exhausted after the first pass.
    for idx, (label, x, y) in enumerate(zip(vocabulary, Y[:, 0], Y[:, 1])):
        label_color = colors[labels[idx]]
        plt.plot(x, y, color=label_color, marker='o', markersize=1)
        if show_labels:
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.axis('off')
    filename = f'results/full_plot_{"labels_" if show_labels else ""}{VOCAB_SIZE}_{kmeans_clusters}.pdf'
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
    print(f"Saved '{filename}'")
	# -*- coding: utf-8 -*-

	import sys
	import time
	import numpy as np
	import gensim
	import matplotlib.pyplot as plt
	from matplotlib import font_manager, rc
	from sklearn.manifold import TSNE
	from sklearn.cluster import MiniBatchKMeans
	from scipy.spatial import distance
	import random
	import math
	from adjustText import adjust_text


	VOCAB_SIZE = 10000 # number of leading word vectors fed to t-SNE (MAX 3.000.000)


	# Load the Google-News Model (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM)
	model_path = "./GoogleNews-vectors-negative300.bin"
	model = gensim.models.KeyedVectors.load_word2vec_format(
	    model_path, binary=True)
	# NOTE(review): `.wv.syn0` / `.wv.vocab` are legacy gensim attribute names --
	# confirm they exist in the installed gensim version.
	wv = model.wv.syn0 # word vectors (wv)
	vocabulary = model.wv.vocab # vocabulary (lexicon)


	# Run TSNE on the first VOCAB_SIZE word vectors only
	tsne = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	Y = tsne.fit_transform(wv[:VOCAB_SIZE, :])
	# Materialise the (word, x, y) triples: a bare zip() is a one-shot iterator
	# and would already be exhausted the second time it is walked. zip() stops
	# at its shortest input, so there is one triple per row of Y.
	# Assumes `vocabulary` iterates in the same order as the rows of `wv` -- TODO confirm.
	word_positions = list(zip(vocabulary, Y[:, 0], Y[:, 1]))


	def get_word_position(word):
	    """Look up a word's row and 2-D coordinates in the t-SNE embedding.

	    Only the first ``len(Y)`` entries of ``vocabulary`` (in iteration order)
	    have a row in ``Y``, so only those words can be found. On success the
	    position is also printed.

	    Args:
	        word: The word to look up.

	    Returns:
	        A ``(index, x, y)`` tuple with the word's row in ``Y`` and its
	        coordinates, or ``(None, None, None)`` when the word is unknown or
	        has no row in ``Y``.
	    """
	    not_found = (None, None, None)
	    try:
	        if word not in vocabulary:
	            return not_found
	    except TypeError:
	        # The membership test itself failed (e.g. an unhashable `word`
	        # against a dict-like vocabulary): treat the word as unknown.
	        return not_found

	    # Pairing with range(len(Y)) caps the scan at the embedded rows: no copy
	    # of the whole vocabulary is built and Y is never indexed out of range.
	    for word_index, candidate in zip(range(len(Y)), vocabulary):
	        if candidate == word:
	            word_x = Y[word_index, 0]
	            word_y = Y[word_index, 1]
	            print(f"'{word}' at ({word_x},{word_y})")
	            return word_index, word_x, word_y
	    return not_found


	def get_random_color(pastel_factor=0.9):
	    """Return a random RGB colour as a list of three floats.

	    Each channel is drawn uniformly from [0, 1] and then shifted towards 1
	    via ``(channel + pastel_factor) / (1.0 + pastel_factor)``, so a larger
	    `pastel_factor` gives a lighter colour.
	    """
	    rgb = []
	    for _ in range(3):
	        channel = random.uniform(0, 1.0)
	        rgb.append((channel + pastel_factor) / (1.0 + pastel_factor))
	    return rgb


	def save_word_plot(word, max_dist):
	    """Plot the t-SNE surroundings of a word and save them as a PDF.

	    Every word whose 2-D position lies closer than `max_dist` (Euclidean) to
	    the position of `word` is drawn as a labelled point. The figure is
	    written to ``results/plot_{word}_{max_dist}.pdf``; the ``results``
	    directory is not created here. Nothing is plotted or saved when `word`
	    has no position in the embedding.

	    Args:
	        word: The word whose surroundings are plotted.
	        max_dist: Exclusive radius around the word, in t-SNE plot units.
	    """
	    word_index, word_x, word_y = get_word_position(word)
	    if word_index is None or word_x is None or word_y is None:
	        return

	    # Open the figure only once the lookup has succeeded, so an unknown word
	    # does not leave an empty figure behind that is never closed.
	    plt.figure()

	    # Collect the words located within max_dist of the word of interest
	    centre = [word_x, word_y]
	    vocab_new = []
	    x_new = []
	    y_new = []
	    for w, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
	        if distance.euclidean(centre, [x, y]) < max_dist:
	            vocab_new.append(w)
	            x_new.append(x)
	            y_new.append(y)

	    # Plot the points, then label each one
	    plt.scatter(x_new, y_new, c=get_random_color())
	    texts = [plt.text(x, y, label)
	             for label, x, y in zip(vocab_new, x_new, y_new)]
	    adjust_text(texts)
	    plt.axis('off')
	    filename = f'results/plot_{word}_{max_dist}.pdf'
	    plt.savefig(filename, bbox_inches='tight')
	    plt.close()
	    print(f"Saved '{filename}'")


	def save_full_plot(show_labels=False, kmeans_clusters=8):
	    """Plot the full t-SNE embedding, coloured by k-means cluster, as a PDF.

	    The 2-D points in ``Y`` are clustered with MiniBatchKMeans and each point
	    is drawn in its cluster's random pastel colour (plain gray when only one
	    cluster is requested). The figure is written to
	    ``results/full_plot_[labels_]{VOCAB_SIZE}_{kmeans_clusters}.pdf``.

	    Args:
	        show_labels: When true, annotate every point with its word.
	        kmeans_clusters: Number of k-means clusters used for colouring.
	    """
	    bw = kmeans_clusters == 1
	    plt.figure()
	    kmeans = MiniBatchKMeans(n_clusters=kmeans_clusters)
	    labels = kmeans.fit_predict(Y)
	    colors = [get_random_color() for _ in range(kmeans_clusters)]
	    if bw:
	        colors = ["gray"]
	    # Pair words with their coordinates locally so that every call sees all
	    # points; a shared zip iterator would be exhausted after the first pass.
	    for idx, (label, x, y) in enumerate(zip(vocabulary, Y[:, 0], Y[:, 1])):
	        label_color = colors[labels[idx]]
	        plt.plot(x, y, color=label_color, marker='o', markersize=1)
	        if show_labels:
	            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
	    plt.axis('off')
	    filename = f'results/full_plot_{"labels_" if show_labels else ""}{VOCAB_SIZE}_{kmeans_clusters}.pdf'
	    plt.savefig(filename, bbox_inches='tight')
	    plt.close()
	    print(f"Saved '{filename}'")