Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Last active July 12, 2016 07:19
Show Gist options
  • Save devashishd12/584c2cfd586f0a56c8f4a1dc38b067c3 to your computer and use it in GitHub Desktop.
Save devashishd12/584c2cfd586f0a56c8f4a1dc38b067c3 to your computer and use it in GitHub Desktop.
Benchmark testing for coherence measures in gensim
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from scipy.stats import pearsonr\n",
"from datetime import datetime\n",
"\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora.dictionary import Dictionary"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dataset = fetch_20newsgroups(subset='all')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"documents = dataset['data'] # is a list of documents"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"texts = []\n",
"for document in documents:\n",
" # lower case all words\n",
" lowered = document.lower()\n",
" #remove punctuation and split into seperate words\n",
" words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n",
" texts.append(words)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dictionary = Dictionary(texts)\n",
"corpus = [dictionary.doc2bow(text) for text in texts]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"18846\n",
"Dictionary(173771 unique tokens: [u'3ds2scn', u'25599', u'diagnositic', u'9l2t', u'l1tbk']...)\n"
]
}
],
"source": [
"print len(documents)\n",
"print dictionary"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"topics = [] # list of 100 topics\n",
"for l in open('/home/devashish/datasets/20NG/topics20NG.txt'):\n",
" topics.append([l.split()])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"human_scores = []\n",
"for l in open('/home/devashish/datasets/20NG/gold20NG.txt'):\n",
" human_scores.append(float(l.strip()))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"23\n",
"24\n",
"25\n",
"26\n",
"27\n",
"28\n",
"29\n",
"30\n",
"31\n",
"32\n",
"33\n",
"34\n",
"35\n",
"36\n",
"37\n",
"38\n",
"39\n",
"40\n",
"41\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
"48\n",
"49\n",
"50\n",
"51\n",
"52\n",
"53\n",
"54\n",
"55\n",
"56\n",
"57\n",
"58\n",
"59\n",
"60\n",
"61\n",
"62\n",
"63\n",
"64\n",
"65\n",
"66\n",
"67\n",
"68\n",
"69\n",
"70\n",
"71\n",
"72\n",
"73\n",
"74\n",
"75\n",
"76\n",
"77\n",
"78\n",
"79\n",
"80\n",
"81\n",
"82\n",
"83\n",
"84\n",
"85\n",
"86\n",
"87\n",
"88\n",
"89\n",
"90\n",
"91\n",
"92\n",
"93\n",
"94\n",
"95\n",
"96\n",
"97\n",
"98\n",
"99\n",
"Time taken: 0:06:30.513886\n"
]
}
],
"source": [
"start = datetime.now()\n",
"u_mass = []\n",
"flags = []\n",
"for n, topic in enumerate(topics):\n",
" print n # for personal monitoring purposes. sorry for this\n",
" try:\n",
" cm = CoherenceModel(topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n",
" u_mass.append(cm.get_coherence())\n",
" except KeyError:\n",
" flags.append(n)\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8\n",
"9\n",
"10\n",
"11\n",
"12\n",
"13\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
"19\n",
"20\n",
"21\n",
"22\n",
"23\n",
"24\n",
"25\n",
"26\n",
"27\n",
"28\n",
"29\n",
"30\n",
"31\n",
"32\n",
"33\n",
"34\n",
"35\n",
"36\n",
"37\n",
"38\n",
"39\n",
"40\n",
"41\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
"48\n",
"49\n",
"50\n",
"51\n",
"52\n",
"53\n",
"54\n",
"55\n",
"56\n",
"57\n",
"58\n",
"59\n",
"60\n",
"61\n",
"62\n",
"63\n",
"64\n",
"65\n",
"66\n",
"67\n",
"68\n",
"69\n",
"70\n",
"71\n",
"72\n",
"73\n",
"74\n",
"75\n",
"76\n",
"77\n",
"78\n",
"79\n",
"80\n",
"81\n",
"82\n",
"83\n",
"84\n",
"85\n",
"86\n",
"87\n",
"88\n",
"89\n",
"90\n",
"91\n",
"92\n",
"93\n",
"94\n",
"95\n",
"96\n",
"97\n",
"98\n",
"99\n",
"Time taken: 1:29:53.612739\n"
]
}
],
"source": [
"start = datetime.now()\n",
"c_v = []\n",
"for n, topic in enumerate(topics):\n",
" print n # for personal monitoring purposes. sorry for this\n",
" try:\n",
" cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n",
" c_v.append(cm.get_coherence())\n",
" except KeyError:\n",
" pass\n",
"end = datetime.now()\n",
"print \"Time taken: %s\" % (end - start)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"final_scores = []\n",
"for n, score in enumerate(human_scores):\n",
" if n not in flags:\n",
" final_scores.append(score)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"99 99 99\n"
]
}
],
"source": [
"print len(u_mass), len(c_v), len(final_scores)\n",
"# 1 topic has word(s) that is not in the dictionary. Probably some difference\n",
"# in preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.465570184541\n",
"0.45335681789\n"
]
}
],
"source": [
"print pearsonr(u_mass, final_scores)[0]\n",
"print pearsonr(c_v, final_scores)[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment