@tokestermw
Last active September 7, 2021 16:57
Visualizing topic models in four different ways
import json
import urlparse
from itertools import chain
flatten = chain.from_iterable
from nltk import word_tokenize
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.tfidfmodel import TfidfModel
# from gensim_btm.models import BTGibbsModel
## get BART strike data
def url_away(tweet):
    """Drop words that parse as URLs, keeping everything else."""
    words = []
    for word in tweet.split():
        try:
            scheme, netloc, path, params, query, fragment = urlparse.urlparse(word)
        except ValueError:
            continue
        if not (scheme or netloc):
            words.append(path)
    return " ".join(words)
def featurize(tweet):
    """Lowercase, strip URLs, tokenize, and drop tokens shorter than 3 characters."""
    tweet = tweet.lower()
    tweet = url_away(tweet)
    tokens = word_tokenize(tweet)
    tokens = filter(lambda x: len(x) > 2, tokens)
    return tokens

with open('data/twitter-bart.json', 'r') as f:
    dictionary = Dictionary(featurize(json.loads(line)['text']) for line in f)
class MyCorpus(object):
    def __init__(self, data_file, dictionary):
        self.data_file = data_file
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.data_file, 'r') as f:
            for line in f:
                doc = json.loads(line)
                features = featurize(doc['text'])
                yield self.dictionary.doc2bow(features)
corpus = MyCorpus("./data/twitter-bart.json", dictionary)
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
n_topics = 40
lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
#### ------ how to visualize
#### http://tedunderwood.com/2012/11/11/visualizing-topic-models/
#### ------
## word lists
for i in range(n_topics):
    terms = lda.show_topic(i, 10)  # (weight, term) pairs in the gensim version used here
    print "Top 10 terms for topic #" + str(i) + ": " + ", ".join([term for weight, term in terms])
"""
Top 10 terms for topic #0: expected, rep, announces, over, looks, ktvu, want, like, guys, will
Top 10 terms for topic #1: sfgate, gridlock, normal, happy, head, real, ferries, over, runs, exactly
Top 10 terms for topic #2: a.m., another, vote, members, other, reason, let, pass, ser…, major
Top 10 terms for topic #3: today, killed, sanfranmag, hopes, ave, expect, delays, running, home, train
"""
## word clouds
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def terms_to_wordcounts(terms, multiplier=1000):
    # repeat each term in proportion to its topic weight so WordCloud can count it
    return " ".join([" ".join(int(multiplier * weight) * [term]) for weight, term in terms])

# generate() takes only the text; the stray extra argument raised a TypeError
wordcloud = WordCloud(font_path="Impact_Label.ttf", background_color="black").generate(terms_to_wordcounts(terms))
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig("terms1")
plt.close()
## topic-word vectors: topics vs. words
from sklearn.feature_extraction import DictVectorizer

def topics_to_vectorspace(n_topics, n_words=100):
    # one {term: weight} dict per topic, ready for DictVectorizer
    rows = []
    for i in xrange(n_topics):
        temp = lda.show_topic(i, n_words)
        rows.append(dict((term, weight) for weight, term in temp))
    return rows

vec = DictVectorizer()
X = vec.fit_transform(topics_to_vectorspace(n_topics))
X.shape
# (40, 2457)
## PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit(X.toarray()).transform(X.toarray())

plt.figure()
for i in xrange(X_pca.shape[0]):
    plt.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
    plt.text(X_pca[i, 0], X_pca[i, 1], s=' ' + str(i))
plt.title('PCA Topics of Bart Strike Tweets')
plt.savefig("pca_topic")
plt.close()
"""
In [231]: lda.show_topic(19)
Out[231]:
[(0.18418766385173357, u'tuesday'),
(0.10941284772798156, u'over'),
(0.074073551230934093, u'deal'),
(0.057823820985690839, u'reached'),
(0.040004840107066328, u'start'),
(0.014618710538754369, u'chrisfilippi'),
(0.01175792963040383, u'commute'),
(0.010096535268990677, u'buses'),
(0.0099316408990382157, u'abc7newsbayarea'),
(0.0089298280179637094, u'late')]
In [232]: lda.show_topic(21)
Out[232]:
[(0.19463842681026511, u'over'),
(0.039005911200223162, u'rosenbergmerc'),
(0.034922463036658115, u'all'),
(0.029778810358060626, u'thank'),
(0.018876961986207651, u'unions'),
(0.018656417067857364, u'hopefully'),
(0.016893084271683283, u'pissed'),
(0.013589706807636848, u'someone'),
(0.012154548491692339, u'per'),
(0.011787301022370321, u'kron4news')]
In [233]: lda.show_topic(31)
Out[233]:
[(0.073542501022656165, u'tentative'),
(0.068444064522636891, u'reach'),
(0.057170204108103105, u'run'),
(0.054732038737147118, u'trains'),
(0.054298622740944123, u'contract'),
(0.046550628739041838, u'unions'),
(0.041191568872948219, u'deal'),
(0.040003874892020903, u'abc7newsbayarea'),
(0.030594570247699304, u'agreement'),
(0.025459332467351173, u'announcement')]
"""
X_pca = pca.fit(X.T.toarray()).transform(X.T.toarray())

plt.figure()
for i, n in enumerate(vec.get_feature_names()):
    plt.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
    plt.text(X_pca[i, 0], X_pca[i, 1], s=' ' + n, fontsize=8)
plt.title('PCA Words of Bart Strike Tweets')
plt.savefig("pca_words")
plt.close()
## hierarchical clustering
from scipy.cluster.hierarchy import linkage, dendrogram
plt.figure(figsize=(12,6))
R = dendrogram(linkage(X_pca))
plt.savefig("dendro")
plt.close()
## correlation matrix
from scipy.spatial.distance import pdist, squareform

# pairwise euclidean distances between topic vectors (despite the "correlation" label)
cor = squareform(pdist(X.toarray(), metric="euclidean"))
plt.figure(figsize=(12,6))
R = dendrogram(linkage(cor))
plt.savefig("corr")
plt.close()
## network
import networkx as nx
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

pca_norm = make_pipeline(PCA(n_components=20), Normalizer(copy=False))
X_pca_norm = pca_norm.fit(X.toarray()).transform(X.toarray())
cor = squareform(pdist(X_pca_norm, metric="euclidean"))

G = nx.Graph()
for i in xrange(cor.shape[0]):
    for j in xrange(cor.shape[1]):
        if i == j:
            G.add_edge(i, j, {"weight": 0})  # networkx 1.x style: attribute dict as positional arg
        else:
            G.add_edge(i, j, {"weight": 1.0 / cor[i, j]})  # closer topics get heavier edges

edges = [(i, j) for i, j, w in G.edges(data=True) if w['weight'] > .8]
edge_weight = dict([((u, v), int(d['weight'])) for u, v, d in G.edges(data=True)])

# pos = nx.graphviz_layout(G, prog="twopi")  # twopi, neato, circo
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=100, alpha=.5)
nx.draw_networkx_edges(G, pos, edgelist=edges, width=1)
# nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_weight)
nx.draw_networkx_labels(G, pos, font_size=8, font_family='sans-serif')
plt.savefig("network")
plt.close()
@koljamaier commented Jun 12, 2017

Nice work!
Did you mix up the indices in lines 69 and 84, though (for i)?
I worked on different data and had to swap them.
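
For reference, a minimal sketch of that swap, assuming a newer gensim in which show_topic returns (term, weight) pairs rather than (weight, term):

def terms_to_wordcounts(terms, multiplier=1000):
    # sketch: unpack (term, weight), the order newer gensim versions return
    return " ".join([" ".join(int(multiplier * weight) * [term]) for term, weight in terms])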

@herlimenezes commented Oct 30, 2017

Nice work, indeed!
Line 86 raises an exception if you pass the additional parameter 1000.
It worked in this case:
wordcloud = WordCloud(font_path="LiberationMono-Regular.ttf", background_color="black").generate(terms_to_wordcounts(terms))

In order to use Python 3.x you must replace xrange with range; xrange no longer exists.
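
A rough sketch of the other Python 3 equivalents the gist would need (the toy tokens below are just for illustration):

# sketch of Python 3 replacements for the Python 2 constructs used above
from urllib.parse import urlparse             # the urlparse module moved into urllib.parse

tokens = ["bart", "strike", "is", "over"]     # toy data for illustration
tokens = [t for t in tokens if len(t) > 2]    # filter() returns a lazy iterator in Python 3
for i in range(2):                            # xrange is gone; range is already lazy
    print("Top terms for topic #" + str(i))   # print is a function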

@cyclecycle
Thanks a lot for these examples.
