Created
May 8, 2014 19:56
-
-
Save thiagomarzagao/459ebc07a0abe32407bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit an LDA topic model on a Matrix Market corpus and export the results.
#
# Files written under `opath`:
#   weights      pickled dict: topic number (starting at 1) -> list of
#                (token_id, token, weight) tuples
#   reduced.csv  one row per case, one column per topic
import logging
import pickle

import gensim
import pandas as pd

from casenames import casenames

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    filename='output.log',
)

# set number of topics
num_topics = 50

# set input paths (paths to corpora folder and to corpora files)
ipath = '/home/ubuntu/corpus_a/'
corpora = ipath + 'corpus_a.mm'

# set output path
opath = '/home/ubuntu/output/'

# load corpus
corpus = gensim.corpora.MmCorpus(corpora)

# run LDA
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    alpha='auto',
    chunksize=20,
    iterations=1000,
)
# optional: persist the fitted model
#lda.save(opath + 'lda_params')

# load id2token dictionary
# NOTE: pickle.load executes arbitrary code from the file; only use it on a
# trusted, locally produced 'id2token' file.
with open(ipath + 'id2token', mode='rb') as f:
    id2token = pickle.load(f)

# retrieve topic weights
# Each entry of a topic is indexed as (weight, token_id): position 1 is cast
# to int and looked up in id2token, position 0 is stored as the weight.
# NOTE(review): the `topics=` / `topn=` keywords and this tuple layout look
# like an older gensim API - confirm against the installed gensim version.
topics = {}
raw_topics = lda.show_topics(topics=num_topics, topn=500, formatted=False)
for topic_num, input_list in enumerate(raw_topics, start=1):
    topics[topic_num] = [
        (int(tup[1]), id2token[int(tup[1])], tup[0])
        for tup in input_list
    ]

# save topic weights to disk
with open(opath + 'weights', mode='wb') as f:
    pickle.dump(topics, f)

# optional: put reduced data in pickle format and save to file
#reduced = list(lda[corpus])
#with open(opath + 'reduced', mode='wb') as f:
#    pickle.dump(reduced, f)

# put reduced data in DataFrame format and save to file
data = list(lda[corpus])
df = pd.DataFrame(columns=['topics'])
for doc, docname in zip(data, casenames):
    # one two-column frame per document: topic id and that document's weight
    vector = pd.DataFrame(doc, columns=['topics', docname])
    # the outer join keeps topics that are missing from some documents
    df = pd.merge(df, vector, how='outer', on='topics')

# transpose so that each document becomes a row; the first row then holds
# the topic ids, which are turned into column labels and dropped from the data
df = df.T
df.columns = ['topic' + str(int(topic_id)) for topic_id in df.iloc[0]]
df = df[1:]
df.to_csv(opath + 'reduced.csv', index_label='case', index=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment