Created
July 14, 2016 11:53
-
-
Save devashishd12/22c77be7ee5b10c54d53560951e33fbe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The line_profiler extension is already loaded. To reload it, use:\n", | |
" %reload_ext line_profiler\n" | |
] | |
} | |
], | |
"source": [ | |
"import re\n", | |
"import os\n", | |
"\n", | |
"from scipy.stats import pearsonr\n", | |
"from datetime import datetime\n", | |
"\n", | |
"from gensim.models import CoherenceModel\n", | |
"from gensim.corpora.dictionary import Dictionary\n", | |
"%load_ext line_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Directory whose files make up the movie corpus (listed with os.listdir below).\n", | |
"# The MOVIES_CORPUS_DIR environment variable overrides the location so the\n", | |
"# notebook can run on another machine; the original path remains the default.\n", | |
"prefix = os.environ.get(\"MOVIES_CORPUS_DIR\", \"/home/devashish/datasets/Movies/movie\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Time taken: 0:03:42.008070\n" | |
] | |
} | |
], | |
"source": [ | |
"# Read every file under `prefix` and tokenise it line by line; the step is timed.\n", | |
"start = datetime.now()\n", | |
"texts = []  # one list of lower-cased word tokens per input line\n", | |
"for fil in os.listdir(prefix):\n", | |
"    # `with` closes each corpus file as soon as it has been consumed, so file\n", | |
"    # handles do not pile up while walking the whole directory.\n", | |
"    with open(os.path.join(prefix, fil)) as corpus_file:\n", | |
"        for line in corpus_file:\n", | |
"            # lower case all words\n", | |
"            lowered = line.lower()\n", | |
"            # remove punctuation and split into separate words\n", | |
"            # NOTE(review): Python 3's re module rejects re.LOCALE combined with\n", | |
"            # re.UNICODE; the flags are kept as-is for the Python 2 kernel so the\n", | |
"            # tokenisation (and every result below) is unchanged.\n", | |
"            words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n", | |
"            texts.append(words)\n", | |
"end = datetime.now()\n", | |
"print \"Time taken: %s\" % (end - start)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Time taken: 0:01:46.778892\n" | |
] | |
} | |
], | |
"source": [ | |
"# Build the gensim Dictionary from the tokenised lines, then turn every line\n", | |
"# into its doc2bow representation. The whole step is timed.\n", | |
"start = datetime.now()\n", | |
"dictionary = Dictionary(texts)\n", | |
"corpus = []\n", | |
"for tokens in texts:\n", | |
"    corpus.append(dictionary.doc2bow(tokens))\n", | |
"end = datetime.now()\n", | |
"print(\"Time taken: %s\" % (end - start))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"124234\n", | |
"Dictionary(758123 unique tokens: [u'schelberger', u'mdbg', u'shatzky', u'bhetan', u'verplank']...)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Sanity check: number of documents in the corpus, then the dictionary summary.\n", | |
"print(len(corpus))\n", | |
"print(dictionary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[]]" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"topics = [] # list of 100 topics\n", | |
"# `with` guarantees the topics file is closed once it has been read.\n", | |
"with open('/home/devashish/datasets/Movies/topicsMovie.txt') as topics_file:\n", | |
"    for line in topics_file:\n", | |
"        # Each entry is a one-element list wrapping the topic's words, because\n", | |
"        # entries are later passed one at a time as CoherenceModel's `topics`.\n", | |
"        topics.append([line.split()])\n", | |
"# Drop the 101st entry: it is the empty topic ([[]]) displayed as this cell's\n", | |
"# result, presumably produced by a trailing blank line in the file -- TODO confirm.\n", | |
"topics.pop(100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Gold scores, one float per line of goldMovie.txt (presumably the human rating\n", | |
"# of the topic at the same position in `topics` -- verify against the dataset).\n", | |
"human_scores = []\n", | |
"# `with` guarantees the gold file is closed once it has been read.\n", | |
"with open('/home/devashish/datasets/Movies/goldMovie.txt') as gold_file:\n", | |
"    for line in gold_file:\n", | |
"        human_scores.append(float(line.strip()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n", | |
"2\n", | |
"3\n", | |
"4\n", | |
"5\n", | |
"6\n", | |
"7\n", | |
"8\n", | |
"9\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"14\n", | |
"15\n", | |
"16\n", | |
"17\n", | |
"18\n", | |
"19\n", | |
"20\n", | |
"21\n", | |
"22\n", | |
"23\n", | |
"24\n", | |
"25\n", | |
"26\n", | |
"27\n", | |
"28\n", | |
"29\n", | |
"30\n", | |
"31\n", | |
"32\n", | |
"33\n", | |
"34\n", | |
"35\n", | |
"36\n", | |
"37\n", | |
"38\n", | |
"39\n", | |
"40\n", | |
"41\n", | |
"42\n", | |
"43\n", | |
"44\n", | |
"45\n", | |
"46\n", | |
"47\n", | |
"48\n", | |
"49\n", | |
"50\n", | |
"51\n", | |
"52\n", | |
"53\n", | |
"54\n", | |
"55\n", | |
"56\n", | |
"57\n", | |
"58\n", | |
"59\n", | |
"60\n", | |
"61\n", | |
"62\n", | |
"63\n", | |
"64\n", | |
"65\n", | |
"66\n", | |
"67\n", | |
"68\n", | |
"69\n", | |
"70\n", | |
"71\n", | |
"72\n", | |
"73\n", | |
"74\n", | |
"75\n", | |
"76\n", | |
"77\n", | |
"78\n", | |
"79\n", | |
"80\n", | |
"81\n", | |
"82\n", | |
"83\n", | |
"84\n", | |
"85\n", | |
"86\n", | |
"87\n", | |
"88\n", | |
"89\n", | |
"90\n", | |
"91\n", | |
"92\n", | |
"93\n", | |
"94\n", | |
"95\n", | |
"96\n", | |
"97\n", | |
"98\n", | |
"99\n", | |
"Time taken: 0:21:26.985034\n" | |
] | |
} | |
], | |
"source": [ | |
"# Score every topic with the u_mass coherence measure; the step is timed.\n", | |
"start = datetime.now()\n", | |
"u_mass = []  # coherence value of each topic that could be scored\n", | |
"flags = []   # positions of the topics whose scoring raised KeyError\n", | |
"for idx, single_topic in enumerate(topics):\n", | |
"    print(idx) # for personal monitoring purposes. sorry for this\n", | |
"    try:\n", | |
"        model = CoherenceModel(coherence='u_mass', topics=single_topic, corpus=corpus, dictionary=dictionary)\n", | |
"        score = model.get_coherence()\n", | |
"    except KeyError:\n", | |
"        # Remember the position so the matching human score can be left out later.\n", | |
"        flags.append(idx)\n", | |
"    else:\n", | |
"        u_mass.append(score)\n", | |
"end = datetime.now()\n", | |
"print(\"Time taken: %s\" % (end - start))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n", | |
"2\n", | |
"3\n", | |
"4\n", | |
"5\n", | |
"6\n", | |
"7\n", | |
"8\n", | |
"9\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"14\n", | |
"15\n", | |
"16\n", | |
"17\n", | |
"18\n", | |
"19\n", | |
"20\n", | |
"21\n", | |
"22\n", | |
"23\n", | |
"24\n", | |
"25\n", | |
"26\n", | |
"27\n", | |
"28\n", | |
"29\n", | |
"30\n", | |
"31\n", | |
"32\n", | |
"33\n", | |
"34\n", | |
"35\n", | |
"36\n", | |
"37\n", | |
"38\n", | |
"39\n", | |
"40\n", | |
"41\n", | |
"42\n", | |
"43\n", | |
"44\n", | |
"45\n", | |
"46\n", | |
"47\n", | |
"48\n", | |
"49\n", | |
"50\n", | |
"51\n", | |
"52\n", | |
"53\n", | |
"54\n", | |
"55\n", | |
"56\n", | |
"57\n", | |
"58\n", | |
"59\n", | |
"60\n", | |
"61\n", | |
"62\n", | |
"63\n", | |
"64\n", | |
"65\n", | |
"66\n", | |
"67\n", | |
"68\n", | |
"69\n", | |
"70\n", | |
"71\n", | |
"72\n", | |
"73\n", | |
"74\n", | |
"75\n", | |
"76\n", | |
"77\n", | |
"78\n", | |
"79\n", | |
"80\n", | |
"81\n", | |
"82\n", | |
"83\n", | |
"84\n", | |
"85\n", | |
"86\n", | |
"87\n", | |
"88\n", | |
"89\n", | |
"90\n", | |
"91\n", | |
"92\n", | |
"93\n", | |
"94\n", | |
"95\n", | |
"96\n", | |
"97\n", | |
"98\n", | |
"99\n", | |
"Time taken: 0:26:56.513673\n" | |
] | |
} | |
], | |
"source": [ | |
"# Score every topic with the c_v coherence measure; the step is timed.\n", | |
"start = datetime.now()\n", | |
"c_v = []\n", | |
"c_v_flags = []  # positions of the topics skipped here because of a KeyError\n", | |
"for n, topic in enumerate(topics):\n", | |
"    print n # for personal monitoring purposes. sorry for this\n", | |
"    try:\n", | |
"        cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n", | |
"        c_v.append(cm.get_coherence())\n", | |
"    except KeyError:\n", | |
"        # Record the skip instead of swallowing it silently, so that it can be\n", | |
"        # compared with the topics skipped by the u_mass run.\n", | |
"        c_v_flags.append(n)\n", | |
"end = datetime.now()\n", | |
"print \"Time taken: %s\" % (end - start)\n", | |
"# final_scores is filtered with `flags` from the u_mass run, so c_v only lines up\n", | |
"# with it when exactly the same topics were skipped here.\n", | |
"if c_v_flags != flags:\n", | |
"    print \"WARNING: c_v skipped topics %s but u_mass skipped %s; c_v is not aligned with final_scores\" % (c_v_flags, flags)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep the human score of every topic whose position was not recorded in `flags`,\n", | |
"# so the remaining scores line up with the coherence values that were computed.\n", | |
"final_scores = [score for n, score in enumerate(human_scores) if n not in flags]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"99 99 99\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print the lengths of the three score lists on one line.\n", | |
"# One topic contains word(s) that are not in the dictionary, probably because of\n", | |
"# some difference in preprocessing.\n", | |
"print(' '.join(str(len(scores)) for scores in (u_mass, c_v, final_scores)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.133916622716\n", | |
"0.511388503209\n" | |
] | |
} | |
], | |
"source": [ | |
"# Correlate each coherence measure with the retained human scores and print the\n", | |
"# first element of pearsonr's result: u_mass first, then c_v.\n", | |
"for coherence_scores in (u_mass, c_v):\n", | |
"    correlation = pearsonr(coherence_scores, final_scores)[0]\n", | |
"    print(correlation)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment