Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created May 8, 2014 19:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thiagomarzagao/459ebc07a0abe32407bd to your computer and use it in GitHub Desktop.
Save thiagomarzagao/459ebc07a0abe32407bd to your computer and use it in GitHub Desktop.
import pickle
import gensim
import logging
import pandas as pd
from casenames import casenames
# Write log records of level INFO and above to 'output.log'.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    filename='output.log'
)

# How many topics the LDA model is asked to extract.
num_topics = 50

# Input locations: the corpus folder and the .mm corpus file inside it.
ipath = '/home/ubuntu/corpus_a/'
corpora = ipath + 'corpus_a.mm'

# Folder that receives the output files.
opath = '/home/ubuntu/output/'

# Open the corpus through gensim's MmCorpus reader.
corpus = gensim.corpora.MmCorpus(corpora)

# Fit the LDA model on the corpus.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    alpha='auto',
    chunksize=20,
    iterations=1000
)
#lda.save(opath + 'lda_params')
# load id2token dictionary
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this at an 'id2token' file produced by a trusted step of the pipeline.
with open(ipath + 'id2token', mode='rb') as f:
    id2token = pickle.load(f)

# retrieve topic weights
# `topics` maps a 1-based topic number to a list of
# (token id, token, weight) tuples, kept in the order show_topics returns them.
topics = {}
raw_topics = lda.show_topics(topics=num_topics, topn=500, formatted=False)
for topic_num, input_list in enumerate(raw_topics, start=1):
    # Each entry is read as a (weight, token id) pair -- presumably what the
    # gensim version this script targets returns; confirm against the
    # installed version. The id is cast to int before the id2token lookup.
    topics[topic_num] = [
        (int(token_id), id2token[int(token_id)], weight)
        for weight, token_id in input_list
    ]

# save topic weights to disk
with open(opath + 'weights', mode='wb') as f:
    pickle.dump(topics, f)
# put reduced data in pickle format and save to file
#reduced = [l for l in lda[corpus]]
#f = open(opath + 'reduced', mode = 'wb')
#pickle.dump(reduced, f)
#f.close()

# put reduced data in DataFrame format and save to file
# Materialize the per-document output of the model; each item is turned into
# a two-column frame below, so it is expected to be a sequence of pairs.
data = list(lda[corpus])

# Start from an empty frame keyed on 'topics' and outer-merge one column per
# document, so every topic id seen in any document gets a row and ids a
# document lacks are left as missing values.
# NOTE: zip stops at the shorter of `data` and `casenames`; a length mismatch
# silently drops the surplus documents or names.
# NOTE: one merge per document copies the growing frame on every iteration.
df = pd.DataFrame(columns=['topics'])
for doc, docname in zip(data, casenames):
    vector = pd.DataFrame(doc, columns=['topics', docname])
    df = pd.merge(df, vector, how='outer', on='topics')

# Transpose so that each document becomes a row; the first row then holds the
# topic ids, which are turned into 'topic<N>' column labels and dropped.
df = df.T
df.columns = ['topic' + str(int(topic_id)) for topic_id in df.iloc[0]]
df = df[1:]
df.to_csv(opath + 'reduced.csv', index_label='case', index=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment