Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thiagomarzagao/1b7ecc3335f758fdf713 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/1b7ecc3335f758fdf713 to your computer and use it in GitHub Desktop.
import re
import math
import pickle
import logging
import gensim
import numpy as np
import pandas as pd
from casenames import casenames
# write INFO-level (and above) log records to output.log
logging.basicConfig(
    filename='output.log',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# number of LSI topics to extract
num_topics = 50

# input locations: the corpus folder and the Matrix Market corpus file in it
ipath = '/home/ubuntu/corpus_a/'
corpora = ipath + 'corpus_a.mm'

# folder that receives every output file
opath = '/home/ubuntu/output/'
# read the corpus from its Matrix Market file
corpus = gensim.corpora.MmCorpus(corpora)


def natural_log_idf(docfreq, totaldocs):
    """Return the global term weight: log base e of totaldocs / docfreq."""
    # 1.0 * ... forces float division even where `/` on ints would truncate
    return math.log(1.0 * totaldocs / docfreq, math.exp(1))


# build the TF-IDF model with normalization enabled and apply it to the corpus
tfidf_maker = gensim.models.TfidfModel(
    corpus,
    normalize=True,
    wglobal=natural_log_idf,
)
tfidf = tfidf_maker[corpus]

# fit LSI on the TF-IDF-weighted corpus
lsi = gensim.models.LsiModel(corpus=tfidf, num_topics=num_topics)
# pull the decomposition pieces out of the fitted model
singular_values = lsi.projection.s
u = lsi.projection.u
s = np.diag(singular_values)

# project every document, transpose the dense result, and divide each column
# by its singular value to obtain v
projected = gensim.matutils.corpus2dense(lsi[tfidf], len(singular_values))
v = projected.T / singular_values

# scale v back by the singular values (matrix product with diag(s))
vs = np.dot(v, s)

# label the rows with the case names and the columns as topic0, topic1, ...
vs = pd.DataFrame(vs)
vs['case'] = casenames
vs = vs.set_index('case')
vs.columns = ['topic' + str(t) for t in vs.columns]

# write the labelled matrix to disk
vs.to_csv(opath + 'reduced.csv', index=True, index_label='case')
# load id2token dictionary (looked up by integer token id further down)
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# ipath must only point at files produced by a trusted pipeline.
# The context manager closes the file even if unpickling raises.
with open(ipath + 'id2token', mode='rb') as f:
    words = pickle.load(f)
# inspect topic-predictive tokens
# The parsing below expects each topic to be a string of '+'-separated terms,
# each shaped like  coefficient*"token_id" . Every topic becomes a list of
# (coefficient, token) tuples stored under a 1-based topic number.
token_id_pattern = re.compile(r'"(.*?)"')  # compiled once, reused per term
topics = {}
topic_num = 0
for topic_string in lsi.print_topics(num_topics=num_topics, num_words=500):
    topic_num += 1
    # Newer gensim releases yield (topic_id, string) pairs instead of bare
    # strings; keep just the formatted string so both shapes parse alike.
    if isinstance(topic_string, tuple):
        topic_string = topic_string[1]
    topic_tuples = []
    for term in topic_string.split('+'):
        # the quoted part is the integer token id
        token_id = int(token_id_pattern.findall(term)[0])
        # the part before '*' is the coefficient; drop stray quote characters
        coefficient = float(term.split('*')[0].strip().replace("'", ""))
        # translate the id into its token via the id2token dictionary
        topic_tuples.append((coefficient, words[token_id]))
    topics[topic_num] = topic_tuples
# save topic-predictive tokens to disk
# The context manager closes the file even if pickling raises part-way.
with open(opath + 'weights', mode='wb') as f:
    pickle.dump(topics, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment