import re | |
import os | |
from scipy.stats import pearsonr | |
from datetime import datetime | |
from gensim.models import CoherenceModel | |
from gensim.corpora.dictionary import Dictionary | |
# Directory holding the movie corpus; every file inside it is read.
prefix = "/home/devashish/datasets/Movies/movie"

start = datetime.now()

# Compiled once instead of being re-specified for every line.
# NOTE(review): the flag combination re.UNICODE | re.LOCALE is kept exactly
# as before so tokenisation does not change, but the two flags ask for
# different definitions of \w (Unicode database vs. current locale) and
# Python 3 refuses LOCALE on str patterns -- confirm which one is intended.
word_pattern = re.compile(r'\w+', flags=re.UNICODE | re.LOCALE)

# One token list per input line (not per file).
texts = []
for fil in os.listdir(prefix):
    # The context manager closes each file as soon as it has been read,
    # rather than leaving the handle open until garbage collection.
    with open(os.path.join(prefix, fil)) as handle:
        for line in handle:
            # Lower-case the line, then keep only runs of word characters;
            # this drops punctuation and splits the line into words.
            texts.append(word_pattern.findall(line.lower()))

end = datetime.now()
print("Time taken: %s" % (end - start))
# Time how long it takes to build the gensim Dictionary from the tokenised
# lines in `texts` and to convert every token list with `doc2bow`.
start = datetime.now()
dictionary = Dictionary(texts)
corpus = []
for text in texts:
    corpus.append(dictionary.doc2bow(text))
end = datetime.now()

print("Time taken: %s" % (end - start))
# Report the number of converted documents and the dictionary itself.
print(len(corpus))
print(dictionary)
# Topics to score (intended to be a list of 100 topics). Each line of the
# file is split on whitespace into words, and that word list is wrapped in
# its own one-element list, so every entry of `topics` is a list holding a
# single topic.
with open('/home/devashish/datasets/Movies/topicsMovie.txt') as topics_file:
    topics = [[row.split()] for row in topics_file]
# Remove the entry at index 100, i.e. the 101st line read from the file.
# NOTE(review): presumably a trailing non-topic line -- confirm against the
# data file. Raises IndexError when the file has fewer than 101 lines.
topics.pop(100)

# Gold-standard scores: one float per line of the file.
with open('/home/devashish/datasets/Movies/goldMovie.txt') as scores_file:
    human_scores = [float(row.strip()) for row in scores_file]
# Time the c_v coherence computation and collect one score per scored topic.
start = datetime.now()
c_v = []
# Only the first entry of `topics` is scored (note the [:1] slice).
for n, topic in enumerate(topics[:1]):
    print(n)  # progress indicator while the loop runs
    # No try/except here: a KeyError raised while building or evaluating
    # the model simply propagates to the caller. A handler that only
    # re-raises adds nothing, and a `pass` after `raise` can never run.
    cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary,
                        coherence='c_v')
    c_v.append(cm.get_coherence())
end = datetime.now()

print("Time taken: %s" % (end - start))
print(c_v)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment