"""Create an interactive UMAP embedding display for a word2vec model with a simple script.

Script originally sourced from Peter Baumgartner and then modified by
Lynn Cherny to allow a corpus file, any gensim w2v model file, and to
make or read a counts file before building the UMAP display.

The counts are used to focus on the most common words, and more
frequent words show as lighter colors in the UMAP display Peter made.

NOTE: pip install umap-learn, not umap; the import method below fixes a
bad install/umap issue.
"""
from collections import Counter
import gensim
from numpy import log10
import pandas as pd
import plotly
import plotly.graph_objs as go
import umap.umap_ as umap
# --- Script configuration -------------------------------------------------
# Path of the word2vec model; it is loaded in text (non-binary) format.
model = "gutenberg_fairyfolk_model.txt"
# Path of the text corpus, concatenated into a single file.
corpus = "all_fairy_books.txt"
# Upper bound on the number of words shown in the interactive display.
count_cutoff = 10000
# Path of an existing counts file. Set this to None to have the script
# build one from the corpus instead:
#     counts_file = None
counts_file = "all_fairy_books.txt_counts.txt"
# Name of the HTML file the plot is written to.
output_html_filename = "w2v-umap-fairy.html"
# Words to leave out when counting the corpus. An example stoplist:
#     ["", "of", "the", "in", "a", "an", "to", "with", "is", "was", "as",
#      "for", "that", "which", "and", "And", "have", "be", "from", "or", "are"]
stoplist = []
def make_counts_file(corpus):
    """Count the words of a corpus file and write the most common to disk.

    Words found in the module-level ``stoplist`` are ignored, and at most
    ``count_cutoff`` (also module-level) entries are written, most frequent
    first, one ``word<TAB>count`` pair per line.

    Args:
        corpus: Path of the text corpus to read.

    Returns:
        str: Path of the counts file written, ``corpus + "_counts.txt"``.
    """
    # Build the lookup once; a set keeps each membership test O(1).
    excluded = set(stoplist)
    wordcounter = Counter()
    with open(corpus) as handle:
        for line in handle:
            # split() with no argument breaks on any run of whitespace, so
            # no empty tokens are produced and no token can contain a tab,
            # which would otherwise corrupt the tab-separated output below.
            wordcounter.update(
                word for word in line.split() if word not in excluded
            )
    filename = corpus + "_counts.txt"
    with open(filename, "w") as handle:
        for key, value in wordcounter.most_common(count_cutoff):
            handle.write(key + "\t" + str(value) + "\n")
    return filename
def read_counts_file(r, sep="\t"):
    """Yield the fields of every non-blank line of a delimited file.

    Args:
        r: Path of the file to read.
        sep: Field separator; defaults to a tab.

    Yields:
        list of str: The fields of one line, split on ``sep``, with the
        line's trailing newline removed.
    """
    # The with-block closes the file once the generator is exhausted or
    # discarded, instead of leaving the handle to the garbage collector.
    with open(r) as handle:
        for line in handle:
            line = line.rstrip("\n")
            # A blank line would yield a single empty field and break
            # callers that unpack each row into (word, count).
            if line:
                yield line.split(sep)
def build_tooltip(row):
    """Assemble the HTML label text for one record.

    Args:
        row: Record indexable by 'word', 'count' and 'log_count'.

    Returns:
        str: The word, its count with thousands separators, and its
        rounded log count, each preceded by a bold HTML label and joined
        with no separator in between.
    """
    word = row['word']
    count_text = format(row['count'], ',')
    magnitude_text = str(round(row['log_count']))
    return (
        '<b>Word:</b> ' + word
        + '<b>Count:</b> ' + count_text
        + '<b>Magnitude:</b> ' + magnitude_text
    )
# should wrap this in a main and pass in args, but:
# Load the vectors from the text-format word2vec file named by `model`.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model, binary=False)
# gensim 4 replaced KeyedVectors.vocab with key_to_index; fall back to
# .vocab so that older gensim releases keep working.
vocabulary = getattr(w2v_model, "key_to_index", None)
if vocabulary is None:
    vocabulary = w2v_model.vocab
vocabulary = set(vocabulary)

# Build the counts file from the corpus when none was configured.
if not counts_file:
    counts_file = make_counts_file(corpus)

# Keep only the counted words the model has a vector for, in file order,
# capped at count_cutoff entries.
relevant_words = [
    (word, count)
    for (word, count) in read_counts_file(counts_file)
    if word in vocabulary
][:count_cutoff]
if not relevant_words:
    # Fail with a clear message instead of an obscure error further down.
    raise ValueError(
        "No words from " + counts_file + " are in the model vocabulary; "
        "nothing to plot."
    )

# Look up the vector of every kept word and reduce them with UMAP.
model_reduced = w2v_model[[w[0] for w in relevant_words]]
reducer = umap.UMAP(metric='cosine', n_neighbors=15, min_dist=0.05, random_state=42)
embedding = reducer.fit_transform(model_reduced)

# One row per word: the two embedding coordinates plus count information.
d = pd.DataFrame(embedding, columns=['c1', 'c2'])
d['word'] = [w[0] for w in relevant_words]
d['count'] = [int(w[1]) for w in relevant_words]
d['log_count'] = d['count'].apply(log10)
d['tooltip'] = d.apply(build_tooltip, axis=1)

trace = go.Scattergl(
    x=d['c1'],
    y=d['c2'],
    name='Embedding',
    mode='markers',
    marker=dict(
        # Markers are colored by the log of the word count.
        color=d['log_count'],
        size=6,
        line=dict(
            width=0.5,
        ),
    ),
    # Show the prebuilt tooltip on hover instead of the default readout.
    text=d['tooltip'],
    hoverinfo='text',
)
layout = dict(
    title="Word2Vec 2D UMAP Embeddings for " + corpus,
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False),
    hovermode='closest',
)
fig = go.Figure(data=[trace], layout=layout)
# Will open in browser and save the file offline:
chart = plotly.offline.plot(fig, filename=output_html_filename)
