Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Created July 14, 2016 11:53
Show Gist options
  • Save devashishd12/22c77be7ee5b10c54d53560951e33fbe to your computer and use it in GitHub Desktop.
Save devashishd12/22c77be7ee5b10c54d53560951e33fbe to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The line_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext line_profiler\n"
]
}
],
"source": [
"import re\n",
"import os\n",
"\n",
"from scipy.stats import pearsonr\n",
"from datetime import datetime\n",
"\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora.dictionary import Dictionary\n",
"%load_ext line_profiler"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"prefix = \"/home/devashish/datasets/Movies/movie\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time taken: 0:03:42.008070\n"
]
}
],
"source": [
"start = datetime.now()\n",
"texts = []\n",
"for fil in os.listdir(prefix):\n",
" for line in open(prefix + '/' + fil):\n",
" # lower case all words\n",
" lowered = line.lower()\n",
" # remove punctuation and split into separate words\n",
" words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n",
" texts.append(words)\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time taken: 0:01:46.778892\n"
]
}
],
"source": [
"start = datetime.now()\n",
"dictionary = Dictionary(texts)\n",
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"124234\n",
"Dictionary(758123 unique tokens: [u'schelberger', u'mdbg', u'shatzky', u'bhetan', u'verplank']...)\n"
]
}
],
"source": [
"print len(corpus)\n",
"print dictionary"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[]]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"topics = [] # list of 100 topics\n",
"for l in open('/home/devashish/datasets/Movies/topicsMovie.txt'):\n",
" topics.append([l.split()])\n",
"topics.pop(100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"human_scores = []\n",
"for l in open('/home/devashish/datasets/Movies/goldMovie.txt'):\n",
" human_scores.append(float(l.strip()))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"23\n",
"24\n",
"25\n",
"26\n",
"27\n",
"28\n",
"29\n",
"30\n",
"31\n",
"32\n",
"33\n",
"34\n",
"35\n",
"36\n",
"37\n",
"38\n",
"39\n",
"40\n",
"41\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
"48\n",
"49\n",
"50\n",
"51\n",
"52\n",
"53\n",
"54\n",
"55\n",
"56\n",
"57\n",
"58\n",
"59\n",
"60\n",
"61\n",
"62\n",
"63\n",
"64\n",
"65\n",
"66\n",
"67\n",
"68\n",
"69\n",
"70\n",
"71\n",
"72\n",
"73\n",
"74\n",
"75\n",
"76\n",
"77\n",
"78\n",
"79\n",
"80\n",
"81\n",
"82\n",
"83\n",
"84\n",
"85\n",
"86\n",
"87\n",
"88\n",
"89\n",
"90\n",
"91\n",
"92\n",
"93\n",
"94\n",
"95\n",
"96\n",
"97\n",
"98\n",
"99\n",
"Time taken: 0:21:26.985034\n"
]
}
],
"source": [
"start = datetime.now()\n",
"u_mass = []\n",
"flags = []\n",
"for n, topic in enumerate(topics):\n",
" print n # progress indicator: index of the topic being scored\n",
" try:\n",
" cm = CoherenceModel(topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n",
" u_mass.append(cm.get_coherence())\n",
" except KeyError:\n",
" flags.append(n)\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"23\n",
"24\n",
"25\n",
"26\n",
"27\n",
"28\n",
"29\n",
"30\n",
"31\n",
"32\n",
"33\n",
"34\n",
"35\n",
"36\n",
"37\n",
"38\n",
"39\n",
"40\n",
"41\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
"48\n",
"49\n",
"50\n",
"51\n",
"52\n",
"53\n",
"54\n",
"55\n",
"56\n",
"57\n",
"58\n",
"59\n",
"60\n",
"61\n",
"62\n",
"63\n",
"64\n",
"65\n",
"66\n",
"67\n",
"68\n",
"69\n",
"70\n",
"71\n",
"72\n",
"73\n",
"74\n",
"75\n",
"76\n",
"77\n",
"78\n",
"79\n",
"80\n",
"81\n",
"82\n",
"83\n",
"84\n",
"85\n",
"86\n",
"87\n",
"88\n",
"89\n",
"90\n",
"91\n",
"92\n",
"93\n",
"94\n",
"95\n",
"96\n",
"97\n",
"98\n",
"99\n",
"Time taken: 0:26:56.513673\n"
]
}
],
"source": [
"start = datetime.now()\n",
"c_v = []\n",
"for n, topic in enumerate(topics):\n",
" print n # progress indicator: index of the topic being scored\n",
" try:\n",
" cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n",
" c_v.append(cm.get_coherence())\n",
" except KeyError:\n",
" pass\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"final_scores = []\n",
"for n, score in enumerate(human_scores):\n",
" if n not in flags:\n",
" final_scores.append(score)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"99 99 99\n"
]
}
],
"source": [
"print len(u_mass), len(c_v), len(final_scores)\n",
"# 1 topic contains word(s) that are not in the dictionary, probably because of\n",
"# a difference in preprocessing."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.133916622716\n",
"0.511388503209\n"
]
}
],
"source": [
"print pearsonr(u_mass, final_scores)[0]\n",
"print pearsonr(c_v, final_scores)[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment