Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import re
import os
from scipy.stats import pearsonr
from datetime import datetime
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
# Directory whose files make up the corpus (every entry in it is read).
prefix = "/home/devashish/datasets/Movies/movie"

# Tokenise every line of every file under `prefix`: each line becomes one
# document, stored in `texts` as a list of lower-cased word tokens.
start = datetime.now()
# Compiled once instead of re-resolving the pattern for every line.
# NOTE(review): the flag combination is kept exactly as it was; Python 3's
# `re` rejects re.LOCALE for str patterns, so drop it if this is ever ported.
word_re = re.compile(r'\w+', flags=re.UNICODE | re.LOCALE)
texts = []
for fil in os.listdir(prefix):
    # `with` closes each file as soon as it has been read, rather than
    # leaving the handle open until garbage collection.
    with open(os.path.join(prefix, fil)) as corpus_file:
        for line in corpus_file:
            # Lower-case the line, then remove punctuation and split into
            # separate words by keeping only runs of word characters.
            texts.append(word_re.findall(line.lower()))
end = datetime.now()
print("Time taken: %s" % (end - start))
# Build the gensim Dictionary from the tokenised documents, convert every
# document to its bag-of-words form, and report how long both steps took.
start = datetime.now()
dictionary = Dictionary(texts)
# One doc2bow result per entry of `texts`, in the same order.
corpus = list(map(dictionary.doc2bow, texts))
end = datetime.now()
print("Time taken: %s" % (end - start))
# Number of documents, followed by the dictionary's own string form.
print(len(corpus))
print(dictionary)
# Load the topics to score: one topic per line, as whitespace-separated words.
# Each topic is wrapped in its own one-element list because it is later passed
# on its own as the `topics=` argument of CoherenceModel.
with open('/home/devashish/datasets/Movies/topicsMovie.txt') as topics_file:
    topics = [[l.split()] for l in topics_file]
# Keep only the first 100 topics.  Deleting the tail slice (rather than calling
# pop(100)) drops any surplus trailing entry without raising IndexError when
# the file holds 100 lines or fewer.
del topics[100:]
# Load the human scores, one float per line.
# presumably line i scores topic i of topicsMovie.txt -- verify against the data.
# A blank or malformed line still raises ValueError, so a damaged file cannot
# silently shift the scores out of alignment.
with open('/home/devashish/datasets/Movies/goldMovie.txt') as gold_file:
    human_scores = [float(l.strip()) for l in gold_file]
# Compute the 'c_v' coherence for the selected topics and time the run.
start = datetime.now()
c_v = []
# NOTE(review): the [:1] slice means only the first topic is scored; iterate
# over all of `topics` to score every one -- confirm which is intended.
for n, topic in enumerate(topics[:1]):
    print(n)  # progress indicator while the (slow) coherence run is going
    # `topic` is already a one-element list of topics, which is the shape
    # passed as `topics=` here.  A KeyError raised by gensim propagates to the
    # caller unchanged; the former try/except only re-raised it.
    cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')
    c_v.append(cm.get_coherence())
end = datetime.now()
print("Time taken: %s" % (end - start))
print(c_v)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment