Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Created July 14, 2016 05:48
Show Gist options
  • Save devashishd12/b66a526d39faa41e21ac24a9e3f28193 to your computer and use it in GitHub Desktop.
Save devashishd12/b66a526d39faa41e21ac24a9e3f28193 to your computer and use it in GitHub Desktop.
import re
import os
from scipy.stats import pearsonr
from datetime import datetime
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
# Directory holding the raw corpus files. Every line of every file in it
# becomes one tokenized document in `texts`.
prefix = "/home/devashish/datasets/Movies/movie"
# Compiled once instead of on every line. re.LOCALE has been dropped: combined
# with re.UNICODE it is rejected for str patterns on Python 3 (ValueError),
# and the `re` documentation discourages the flag as unreliable.
word_pattern = re.compile(r'\w+', flags=re.UNICODE)
start = datetime.now()
texts = []
for fil in os.listdir(prefix):
    # `with` closes each file once it has been read, instead of leaving the
    # handle open until garbage collection.
    with open(os.path.join(prefix, fil)) as corpus_file:
        for line in corpus_file:
            # lower-case all words
            lowered = line.lower()
            # keep only runs of word characters: this drops punctuation and
            # splits the line into separate words
            words = word_pattern.findall(lowered)
            texts.append(words)
end = datetime.now()
# Parenthesized form prints the same text on Python 2 and Python 3.
print("Time taken: %s" % (end - start))
# Build the token dictionary and the bag-of-words corpus from `texts`, and
# report how long that took.
start = datetime.now()
dictionary = Dictionary(texts)
# One bag-of-words vector per tokenized document, in the same order as `texts`.
corpus = [dictionary.doc2bow(text) for text in texts]
end = datetime.now()
# Single-argument parenthesized prints produce identical output on Python 2
# and are valid syntax on Python 3.
print("Time taken: %s" % (end - start))
print(len(corpus))
print(dictionary)
# list of 100 topics; each entry is a one-element list wrapping the topic's
# tokens, so a single entry can be passed directly as a `topics` argument.
topics = []
with open('/home/devashish/datasets/Movies/topicsMovie.txt') as topics_file:
    for line in topics_file:
        topics.append([line.split()])
# Keep only the first 100 entries. Unlike `topics.pop(100)`, this does not
# raise IndexError when the file yields 100 entries or fewer.
del topics[100:]

# One float score per line of the gold file.
# NOTE(review): presumably these are human ratings aligned one-to-one with
# `topics` -- confirm against the dataset description.
with open('/home/devashish/datasets/Movies/goldMovie.txt') as scores_file:
    human_scores = [float(line.strip()) for line in scores_file]
# Compute the c_v coherence and report the elapsed time.
start = datetime.now()
c_v = []
# Only the first topic is scored here (`topics[:1]`).
# NOTE(review): looks like a deliberate single-topic timing run -- widen the
# slice to score every topic.
for n, topic in enumerate(topics[:1]):
    print(n)  # progress indicator
    # The former `try/except KeyError: raise` wrapper (with an unreachable
    # `pass` after the raise) was removed: it only re-raised, so a KeyError
    # still propagates exactly as before.
    cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')
    c_v.append(cm.get_coherence())
end = datetime.now()
print("Time taken: %s" % (end - start))
print(c_v)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment