Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import re
import math
import pickle
import logging
import gensim
import numpy as np
import pandas as pd
from casenames import casenames
# write INFO-and-above log records, timestamped, to output.log
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    filename='output.log',
)
# how many topics the model should extract
num_topics = 50
# where the input lives: the corpus folder and the .mm corpus file inside it
ipath = '/home/ubuntu/corpus_a/'
corpora = ipath + 'corpus_a.mm'
# folder that receives every output file
opath = '/home/ubuntu/output/'
# read the corpus from disk
corpus = gensim.corpora.MmCorpus(corpora)


def natural_log_idf(docfreq, totaldocs):
    # global term weight: log base e of (total documents / document frequency)
    return math.log(1.0 * totaldocs / docfreq, math.exp(1))


# re-weight the corpus as normalized TF-IDF
tfidf_maker = gensim.models.tfidfmodel.TfidfModel(
    corpus,
    normalize=True,
    wglobal=natural_log_idf,
)
tfidf = tfidf_maker[corpus]
# fit LSI on the TF-IDF corpus
lsi = gensim.models.LsiModel(corpus=tfidf, num_topics=num_topics)
# build the dimensionality-reduced data from the fitted projection
singular_values = lsi.projection.s
u = lsi.projection.u
s = np.diag(singular_values)
projected = gensim.matutils.corpus2dense(lsi[tfidf], len(singular_values))
# transpose so the rows line up with casenames, then divide each column by
# its singular value
v = projected.T / singular_values
vs = np.dot(v, s)
# write the reduced data to disk: rows indexed by case, columns named topicN
vs = pd.DataFrame(vs)
vs['case'] = casenames
vs = vs.set_index('case')
vs.columns = ['topic{}'.format(t) for t in vs.columns]
vs.to_csv(opath + 'reduced.csv', index=True, index_label='case')
# load id2token dictionary
# NOTE(review): unpickling can execute arbitrary code; only point this at a
# file you produced yourself.
with open(ipath + 'id2token', mode='rb') as f:
    words = pickle.load(f)

# captures the double-quoted token id inside one 'coefficient*"token_id"' term
_QUOTED_TOKEN = re.compile(r'"(.*?)"')


def _parse_topic(topic_string, id2token):
    """Turn one formatted topic string into a list of (coefficient, token).

    topic_string is expected to look like '0.123*"17" + -0.045*"42"': terms
    joined by '+', each term a coefficient, a '*', and a double-quoted
    integer token id. id2token maps those integer ids to the tokens.

    Raises IndexError if a term has no quoted id, ValueError if the id or
    the coefficient is not numeric, and whatever id2token raises for an
    unknown id.
    """
    pairs = []
    for term in topic_string.split('+'):
        token_id = int(_QUOTED_TOKEN.findall(term)[0])
        # the coefficient is everything before '*'; drop stray single quotes
        coefficient = float(term.split('*')[0].strip().replace("'", ""))
        pairs.append((coefficient, id2token[token_id]))
    return pairs


# inspect topic-predictive tokens; topics maps a 1-based topic number to that
# topic's list of (coefficient, token) tuples
topics = {}
formatted_topics = lsi.print_topics(num_topics=num_topics, num_words=500)
for topic_num, string in enumerate(formatted_topics, 1):
    # newer gensim releases return (topic_id, string) pairs instead of bare
    # strings; accept both and keep the 1-based numbering either way
    if isinstance(string, tuple):
        string = string[1]
    topics[topic_num] = _parse_topic(string, words)

# save topic-predictive tokens to disk
with open(opath + 'weights', mode='wb') as f:
    pickle.dump(topics, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment