Skip to content

Instantly share code, notes, and snippets.

@devashishd12
Created August 21, 2016 12:09
Show Gist options
  • Save devashishd12/ac9d3bf57579d02302f9655db8dfdd55 to your computer and use it in GitHub Desktop.
Save devashishd12/ac9d3bf57579d02302f9655db8dfdd55 to your computer and use it in GitHub Desktop.
Notebook for topic coherence use cases blog
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Some topic coherence use cases"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import operator\n",
"from collections import namedtuple\n",
"\n",
"from gensim.models import LdaModel\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora import Dictionary\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"texts = [['human', 'interface', 'computer'],\n",
" ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
" ['eps', 'user', 'interface', 'system'],\n",
" ['system', 'human', 'system', 'eps'],\n",
" ['user', 'response', 'time'],\n",
" ['trees'],\n",
" ['graph', 'trees'],\n",
" ['graph', 'minors', 'trees'],\n",
" ['graph', 'minors', 'survey']]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dictionary = Dictionary(texts)\n",
"corpus = [dictionary.doc2bow(text) for text in texts]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Finding out the optimal number of topics"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def evaluate_graph(dictionary, corpus, texts, limit):\n",
" \"\"\"\n",
" Function to display num_topics - LDA graph using c_v coherence\n",
" \n",
" Parameters:\n",
" ----------\n",
" dictionary : Gensim dictionary\n",
" corpus : Gensim corpus\n",
" limit : topic limit\n",
" \n",
" Returns:\n",
" -------\n",
" lm_list : List of LDA topic models\n",
" \"\"\"\n",
" c_v = []\n",
" lm_list = []\n",
" for num_topics in range(1, limit):\n",
" lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)\n",
" lm_list.append(lm)\n",
" cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')\n",
" c_v.append(cm.get_coherence())\n",
" \n",
" # Show graph\n",
" x = range(1, limit)\n",
" plt.plot(x, c_v)\n",
" plt.xlabel(\"num_topics\")\n",
" plt.ylabel(\"Coherence score\")\n",
" plt.legend((\"c_v\"), loc='best')\n",
" plt.show()\n",
" \n",
" return lm_list"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEQCAYAAABFtIg2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XecVOX1x/HPAQUlSERFbICKUUFjQQXsq2gkERFjQ2NM\nYhKJUTEaC8YYsMSo0ViTKFGxgQbBArafWDaISkkoooAgWChqMBaiICB7fn88d8MAM+zs7p25d2a+\n79drXzvlztyzsDvnPu085u6IiIhk0yTpAEREJL2UJEREJCclCRERyUlJQkREclKSEBGRnJQkREQk\np4InCTPraWazzGy2mV2ynuP2M7OVZvb9+r5WREQKwwq5TsLMmgCzgR7AImAS0NfdZ2U5bgywDLjH\n3R/N97UiIlI4hW5JdAXmuPt77r4SeBg4Nstx5wIjgH834LUiIlIghU4S2wLzM+4viB77HzPbBujj\n7n8FrD6vFRGRwkrDwPXNgMYbRERSaIMCv/9CoH3G/e2ixzLtCzxsZgZsAXzXzL7O87UAmJkKUImI\n1JO7Wz4HFewLaAq8DXQAmgFTgU7rOX4I8P36vjb8GOk2cODApEPIi+KMl+KMl+KMT/S5WefneEFb\nEu6+yszOAZ4jdG3d7e4zzaxfFODgtV9S12sLGa+IiKyp0N1NuPuzwC5rPXZnjmPPqOu1IiJSPGkY\nuK4IVVVVSYeQF8UZL8UZL8VZfAVdTFcsZubl8HOIiBSLmeU1cK2WhIhIDttvvz1mVtJf22+/faP+\nDdSSEBHJIbraTjqMRsn1M6glISIijaYkISIiOSlJiIhITkoSIiKSk5KEiIjkpCQhIiI5KUmIiJSo\nBQsWcPzxx7PlllvSpk0b+vfvH/s5lCREREpQTU0NvXr1YocdduD9999n4cKF9O3bN/bzaDGdiEgO\ndS2ms7p3Y8hLQz6+xo8fz7HHHssHH3xAkya5r/cbu5iu4FVgRUTKVZLXpvPnz6dDhw7rTRBxUHeT\niEgJateuHe+//z41NTUFPY+ShIhICeratStbb701AwYMYOnSpSxfvpxXX3019vMoSYiIlKAmTZow\nevRo5syZQ/v27WnXrh3Dhw+P/TwauBYRyUFVYNWSEBGR9VCSEBGRnMomSSxenHQEIiLlp2ySxB/+\nkHQEIiLlp2wGrjfbzJkyBdq3TzoaESkXGrguo5bEWWfBoEFJRyEiUl7KpizHRRfBt74FM2ZA585J\nRyMi5aBDhw5YXAWaEtKhQ4dGvb5supvcnRtugFdfhUcfTToiEZF0y7e7qaySxLJlsPPOMGIEdOuW\ndFQiIulVcWMSABtvDAMHwoAByVZnFBEpFwVPEmbW08xmmdlsM7sky/O9zWyamU0xs4lmdmDGc+eZ\n2fToK68tl378Y1i0CMaMifGHEBGpUAXtbjKzJsBsoAewCJgE9HX3WRnHtHD3pdHtbwPD3b2Tme0G\nPATsB3wNPAP8wt3nZTnPGrWbRowI6yYmTYICl1oXESlJaelu6grMcff33H0l8DBwbOYBtQki0hKo\nLY7eCZjg7svdfRUwFvh+Pic9/viwY9SIEY2OX0SkohU6SWwLzM+4vyB6bA1m1sfMZgKjgTOih98A\nDjaz1mbWAvge0C6fk5rBtdfCb38LK1c2Kn4RkYqWis4Yd3/c3TsBfYCro8dmAdcBY4CngSnAqnzf\n84gjwurrIUMKELCISIUo9GK6hUBmoYztoseycvdxZrajmW3m7p+4+xBgCICZ/Z41WyVrGJSx3Lqq\nqoqqqir+8Ac47jg47TRo0aKRP4mISAmrrq6murq63q8r9MB1U+AtwsD1B8BE4BR3n5lxTEd3nxvd\n7gI84e7tovtt3H2xmbUHngW6u/uSLOfJuenQCSdA165w8cUx/3AiIiUsNYvpzKwncAuha+tud7/W\nzPoB7u6Dzexi4HRgBbAMuNDdX4teOxbYDFgJnO/u1TnOkTNJzJoFBx8Ms2dD69Yx/3AiIiUqNUmi\nGOravvRnP4Mtt4RrriliUCIiKaYkkWH+fNhrL3jjDdh66yIGJiKSUkoSa7nwQli2DP785yIFJSKS\nYkoSa/n4Y9h1V5gwATp2LFJgIiIplZYV16mxxRZw3nnwu98lHYmISOmomJYEwBdfwE47wbPPhjEK\nEZFKpZZEFi1bwmWXhS8REalbRSUJgDPPDFucjh2bdCQiIulXcUmieXO48kq49FJtTCQiUpeKSxIA\np54KS5bAk08mHYmISLpVZJJo2jSsvv7Nb2BV3nVlRUQqT0UmCYBevaBVKxg2LOlIRETSq6KmwK7t\n5Zfh9NNDEcDmzQsQmIhISmkKbB4OPhg6d4bBg5OOREQknSq6JQEwbRocdRTMmQObbBJzYCIiKaWW\nRJ723BN69ICbb046EhGR9Kn4lgTA3LnQrVsYm9hiixgDExFJKVWBraezz4aNNoIbb4wpKBGRFFOS\nqKcPPoDdd4epU6Fdu5gCExFJKSWJBrjsMvjoI7jrrhiCEhFJMSWJBvjsM9h551D8b9ddYwhMRCSl\nNLupATbdFC66CH7726QjERFJB7Uk1rJsGXzrW/DYY7DffrG8pYhI6qgl0UAbbxy2OL300qQjERFJ\nnpJEFj/5Cbz/Pjz/fNKRiIgkS0kiiw03hKuvhgEDtDGRiFQ2JYkcTjghJIiRI5OOREQkORq4Xo/n\nnoNzz4U334QNNoj97UVEEqOB6xgceSRsuy3ce2/SkYiIJEMtiTpMmBC6nmbPDjOfRETKQWpaEmbW\n08xmmdlsM7sky/O9zWyamU0xs4lmdmDGc+eb2Rtm9rqZDTWzZoWOd23duoX1En/+c7HPLCKSvLxa\nEma2MdDe3d+q15ubNQFmAz2ARcAkoK+7z8o4poW7L41ufxsY7u6dzGwbYBywq7uvMLO/A0+5+/1Z\nzlOwlgTAjBlQVRVaE5tuWrDTiIgUTWwtCTM7BpgKPBvd38vMRuUZR1dgjru/5+4rgYeBYzMPqE0Q\nkZZATcb9psA3zGwDoAUh0RRd587QqxfccEMSZxeRZcvg8MPhj3+EVauSjqay5NPdNIjwYf8ZgLtP\nBXbI8/23BeZn3F8QPbYGM+tjZjOB0cAZ0XkWATcC7wMLgc/cPbHlbYMGwV//Ch9+mFQEUuuNN+CB\nB5KOQorp0kvhG9+AZ56Bgw4KG4RJceQzsXOlu39utkarJNa+HXd/HHjczA4CrgaONLNNCa2ODsDn\nwAgzO9Xdh2V7j0GDBv3vdlVVFVVVVXGGSPv28KMfhUV2t98e61tLPV1/PTz0EDRpAj/4QdLRSKG9\n+CI88gi8/jq0bg133BESxSWXwAUXQNOmSUdYGqqrq6murq736+ockzCzu4EXgAHA8UB/YEN3/0Wd\nb27WHRjk7j2j+wMAd/fr1vOaucB+wOHAUe7+8+jxHwLd3P2cLK8p6JhErcWLoVMnmDgRdtyx4KeT\nLJYuhW22CYscTz0Vhg0Le5RLefrss7AP/R13wHe/u/rxd96BM86Ar76CIUNU2r8h4pzddC6wG7Ac\nGEa4qv9VnnFMAnYysw7RzKS+wBrjGWbWMeN2F6CZu39C6GbqbmYbWWjG9ABm5nnegmjTBvr3DwUA\nJRmjRkH37iExPPIInHIKTJuWdFRSKOedB9/73poJAmCHHeCFF+C000Kr4oYbNFZRMO6e84swcHzD\n+o6p6wvoCbwFzAEGRI/1A86Mbl8MvAFMBl4B9s947UBCYngduI/Qgsl2Di+WJUvc27Z1nzataKeU\nDEcf7X7//avvDx/uvu227u++m1xMUhgjR7rvtJP7F1+s/7i5c92rqty7d3efObM4sZWD6HOzzs/w\nfLqbxrt791gzU8yK1d1U69ZbYcwYGD26aKcU4OOPoWNHWLAANtlk9eO33BK6I155BTbbLLn4JD4f\nfgh77RX2ddl//7qPr6kJE0sGDgyFOc8/X2MVdYlt+1Iz+ythRtIjwJe1j7v7o40NMi7FThLLl8Mu\nu8CDD4amrhTHX/4CL78cBq3XdtFF8NprIXlrZXxpc4fevWGPPeD3v6/fa+fNC2MVK1aEsYpddilM\njOUgzjGJjYD/EAaSj4m+ejUuvNLWvDlceaVKiRfb0KGhDzqb664LM9BOO01906Xu7rtDa3HgwPq/\ndscdw2yoH/wgXMDdeKN+HxpLtZsaaNWqMOviuuvg6KOLeuqKNG9eGLBeuDDs95HN8uVhkLNTJ7jt\nNrA6r5EkbebNg65doboadt+98e+lVkVuca643s7MHjOzf0dfI81su3jCLF1Nm8I114RFPjU1dR8v\njTNsGJx4Yu4EAaGF9+ijoUvq+uuLF5vEY9WqsBbp0ksbnyBgdavi1FPhwAPVqmiofLqbhhCmrW4T\nfY2OHqt4xxwDLVtm7yOX+LiHrqZ8Fs5985vw9NNh/EKrskvLjTeGi6/zz4/vPZs0gXPOCdWcR4+G\nQw6Bt+pVgU7ySRJt3H2Iu38dfd0LtClwXCXBDP7wB7j88tCklcKYMiV0JeUzywXCHiDPPAMXXhgG\nsiX9Xn891GW6997wwR63jh1Dq+KUU9SqqK98/jv+Y2anmVnT6Os0wkC2AIceGvo6//a3pCMpX7Wt\niPqMMXTuDCNGhNdNmVK42KTxli+HH/4wdBFuv33hzpOtVTF7duHOVy7ymQLbAbgN2J9Qs+lVoL+7\nv1/48PKTxMB1pilTwoDpnDmh+0nis2oVtGsXrgIbUnph5MiwSv6VVwr7ASQNN2AAzJwJjz9evMkG\nNTWhS/KKK8IYyHnnVd66itjWSZSCpJMEhGbs7rvDZZclGkbZef75UMjtX/9q+HvcdlvYNOqVV2Dz\nzeOLTRpv3LgwIWHaNNhyy+Kff+7cMAPq66/DDKiddy5+DEmJc3bTfVFF1tr7rc3snsYGWG6uugpu\nugn+o464WOU7YL0+554Lxx4bJhosWxZPXNJ4//1vmM10xx3JJAgIYxUvvQR9+4axiptu0ljF2vLp\nbpri7nvX9ViS0tCSADjrrNDd9Mc/Jh1JeVi2LFR8ffPN8L0xamrg9NPhyy/DWEWldS2k0ZlnwsqV\n4Qo+DWpbFatWwT33lH+rIs4V103MrHXGG29GfvtQVJzLLw+/XAsWJB1JeRg9Gvbdt/EJAsKg5T33\nhKvX/v21Uj5pTz0Fzz0X6m6lRW2r4uST4YAD1KqolU+SuBF4zcyuMrOrCQPXWqqUxTbbhKujK65I\nOpLyEEdXU6ZmzcJiu1degWuvje99pX4+/hh+/nO47z5o1SrpaNbUpEnonpwwIQykH3pomJBSyfIa\nuDazzoTaTQAvuvuMgkZVT2npbgL49NPQTH35ZW2E0hiffBL2DJg/P/4PkkWLwpXiFVeEPnEpHvcw\nUN2hQ1irkGY1NWHCw5VXwm9+E1qg5dRNGefAdUdgrrvfTtj34YjMgWxZU+vWYRHX5ZcnHUlpe+QR\n6NmzMFea22wTFttdfHHo8pDiGTo07E9d3+quSahtVYwfH0qWV2qrIp/uppHAKjPbCbgTaEfYoU5y\nOPdcePVVmDQp6UhKV9xdTWvr1Cl0PZ12GkyeXLjzyGrz54eSGw88ABttlHQ0+evYMRQcPOmk0AK9\n+ebKqteWz+ymye7excwuBpa5+22a3VS3O+8Ms2hUFqL+3nsP9tkndAs1a1bYcz32WFiFO25c6N6S\nwqipgSOPhMMPL+21RG+/HWZAuYeJEN/6VtIRNVycs5tWmtkpwOnAk9Fj66nFKRB+kd59N+zDK/Uz\nbBiccELhEwTAcceF/uaePcOAqhTG7bfD0qVhYWQp22mn0Ko48cTKaVXk05LoDPwCeM3dHzKzHYCT\n3P26YgSYjzS2JAD+/vcwODdhgvY2yJc7fPvbYYFVMXf9u/TS8Mf/wgvQokXxzlsJZs6Egw8OOweW\n8pX32jJbFUOGhARSSmJrSbj7DHfv7+4PRfffSVOCSLMTTwzL/R97LOlISsfrr8MXX4SrtGK65prw\nAXbKKeH/TOKxcmVYxHjVVeWVIGB1q+KEE0KF4ltuKc9WhWo3Fdj//R/86lcwfTpsoCWIdbroorCx\n0DXXFP/cK1ZAr15hs5q//lWtvzgMGhRmBz3zTHn/e5ZiqyLOMQlphO98B7baCu6/P+lI0m/VqrCB\nUyFnNa1Ps2ahauzEickkqXIzcWJItvfcU94JAtZtVTSmIGXa5J0kzEw9tQ1QuzHRoEHamKguY8fC\nFlvAbrslF8Mmm4SSEXfdFTbAkYZZujR0M916azxlVUpBkyah5Pgdd4QJER9+mHRE8chnMd0BZjYD\nmBXd39PM/lLwyMpI9+5h9fXQoUlHkm5Dh4Z1C0nbeuvQPTJgADz7bNLRlKYBA6BLl1AHqdIcfzz8\n9Kfw/e+HDZVKXT6zmyYAJwCjatdGmNkb7h7DVuXxSPOYRK0XXgjz8d98szDbM5a6r74KV5zTp4ft\nR9Pg1VehT5+QMPbZJ+loSseYMaF/fto02GyzpKNJRk1NWHy3ySbp7W6LdUzC3eev9ZBqI9bT4YeH\nMuKjRiUdSTo99RTstVd6EgSEGVaDB4d9KObNSzqa0vDppyFB3H135SYICBeC990Xdq1MU6Xbhsgn\nScw3swMAN7MNzexCYGaB4yo7ZmEh0XXXqUx1NoUuw9FQffqEOlw9e8LixUlHk361Gzx95ztJR5K8\nb3wDnngi/M2Xco2wfLqbtgBuAY4ADHgOOM/dU7MHWyl0N0GYvdOpUxgUPeSQpKNJj08/DftPv/ce\nbJrS0pGXXRa2Un3xxfDHL+t65JHw7zRliv6NMr38cpj1NG5cutaKxLmY7mN3/4G7t3X3Ld39tPok\nCDPraWazzGy2ma2zKN/MepvZNDObYmYTzezA6PGdo8cmR98/N7P++Z43jZo2DesAtJfBmkaODHV9\n0pogAK6+Okw+6NtXi+2y+eCDMOb2wANKEGs7+ODw+9O7N3z+edLR1F8+LYn7CC2Hz6L7rYEb3f2M\nOt/crAkwG+gBLAImAX3dfVbGMS3cfWl0+9vAcHfvlOV9FgDdsoyPlExLAsJshx13DIOhe+yRdDTp\nUFUVpg4ed1zSkazfypVhfKJ9+1DAMY2DkUlwh6OPDrsIXnll0tGk17nnhrGtUaPSsS9FnAPXe9Qm\nCAB3/xTItwJsV2COu7/n7iuBh4FjMw+oTRCRlkC2he1HEPa0WCdBlJrmzcMH4nUqbAKE8tHTp8P3\nvpd0JHXbcMPQpfKvf4UrQwkGD4aPPtIeKnX505/CLL7f/CbpSOonn0IRTcysdZQc6rvH9bZA5gf7\nAkLiWIOZ9QH+ALQBjs7yPicDD+V5ztT7xS9Ca+Kdd1Se+qGHwrzy5s2TjiQ/tYvtDjggzMQ6o872\ndHl7++0wDjF2bEiiktuGG8Lw4dC1ayhimYY1QfnI58O+do/rRwgD1ycAse4r5e6PA4+b2UHA1cCR\ntc+Z2YZAb2DA+t5j0KBB/7tdVVVFVVVVnCHGqlWrsMfvjTeGEsqVbOjQsCq3lGy1VeguPPTQcLsU\nWkGFsGpV2P71t7+Fzp2TjqY0bL556G467LCwzXHXdS6ZC6e6uprq6up6vy7fPa53Aw6L7ua9x7WZ\ndQcGuXvP6P4AwNdXRdbM5gL7ufsn0f3ewC9r3yPHa0pmTKLWhx+GP6xZs2DLLZOOJhnTp4e+7Hff\nLc0FhuPHhzGKp5+G/fZLOpriu/baMLXz+edL8/8vSaNHw1lnhfpWSZUtibvA3yzgUWAU8IWZtc/z\ndZOAncysg5k1A/pG75EZaMeM212AZrUJInIKZdTVVGurrcKKzNtuSzqS5AwdGkpzl+oHTPfuYdHY\nscfC3LlJR1NcU6eGlvC995bu/1+SjjkGzj47rMNZtizpaNYvn9lN5wIDgY8IK62N0BrIa26OmfUk\nrLNoAtzt7teaWb/oPQZH26KeDqwAlgEXuvtr0WtbAO8BO7r7f9dzjpJrSUDoz91//zDjYZNNko6m\nuGpqwtqIp54K/bOl7M474YYb4JVXKqNV+NVXoeV04YWhu0kaxh1OPTVsIXD//cWfLZdvSyKfJPE2\nYeppahbPra1UkwSEAmjdusEFFyQdSXGNHRvm1b/+etKRxOPyy8PeIS+9VP7rBC6+OFzgjBypacCN\ntXRpWFh78slhDVUxxZkkXgKOdPfULiEq5SQxefLq7opi7OmcFmeeCR07lv6ex7Xcw0ynf/87lGIo\n1w2mxo4NCwqnTYM2bZKOpjwsWBAuFP/2t+JOgogzSdwN7AI8Bfyv8K27/6mxQcallJMEhDo3p5wC\nP/lJ0pEUx/LlYbBuypSwMK1crFwZVtVut11YO1BuV9lLlsCee4bZaMcck3Q05eW118LF4j/+EUr3\nFEOcA9fvA2OAZsAmGV8Sk0sugeuvL8/9cbN55hnYfffyShCwerHdlCnlufL4ggugRw8liELYf3/4\n4x/DRcannyYdzZrqbBS7+xWwZvkMiVdmGfE+fZKOpvDSsrlQIbRsueZiu5/9LOmI4jFqVChuOG1a\n0pGUrx/9KPz7nnxymFadli7LfLqb9gfuBlq6e3sz2xPo5+6/LEaA+Sj17iaAESPCDJnXXiu/bopM\nn38eWhDvvgutWycdTeHMmRMGJM8+O6wo33XX0v1/Xbw41BkbPjwUq5PC+frrsHaoc2e46abCnivO\n7qabgaOA/wC4+zRAha5jdtxx8MknYWCwnI0cGVpO5ZwgIJSEHjMGFi2Co44Kg/TnnhtmQH31VdLR\n5c8d+vWDH/5QCaIYNtgAHn44tEbvuSfpaALtTJcStWXEy73wX1o3FyqE3XeHv/wl7JPxxBNhsP6q\nq6Bt29Ct+Le/hSSSZvffH1pFV12VdCSVo3Xr0L03YEBYe5O0fLqbRgB/Am4HugHnAfu6e9/Ch5ef\ncuhugjDrZ4cdwsDunnsmHU38Fi4MC+cWLYKNNko6muT85z/w7LPw5JOhZbH99tCrV+hm2G+/9Kxg\nfu+9UP77+efL8/cx7Z55Bn7601D+pRCTPOKcAqud6Yro+uvD4NXQoUlHEr8bb4QZM0IpCwm+/jqM\nQz35ZOhiWLwYvvvdkDSOPBK++c1k4qqpgSOOCNOzB6y3tKYU0g03wLBhYVe7Fi3ife9YkoSZNQX6\nu3uBh1Aap5ySxJIloYz4pEnlV0Z8771Dojj88KQjSa933w3J4sknQ1fDvvuGhNGrV6gaWiw33RQm\nU4wdm44NciqVe5j1tHx5GKuIc/JDnC2JSe6e6hqX5ZQkAC69FP773/IqIz5jRrgyfv99fejk68sv\n4YUXVieNFi1Wd0sdckjhVujPmBHef8KEMOAuyfrqq1CWvnfvsHdHXOJMEjcBGwJ/B76sfdzdJzc2\nyLiUW5IoxzLil10GK1aEBUNSf+6hG7K2W2rmzLCwrVev0D211VbxnGfFirCwq1+/UDpF0mHRolC6\n4/bbw8rsOMRdu2lt7u6p6TQotyQBYfe6Nm3KY1ZJTU24In3sMdhrr6SjKQ+LF4eBzaeeCns67LTT\n6lZGly4NH/y+/PJQT+zJJ0t3XUe5mjQp1HZ68cV4KifHliRKQTkmibffDvsVvPNO6ZcRHzcuXJm+\n8YY+eAph5cowflHbyvjss/Bh0qtXGHzO9/dn/PhwlTp1Kmy9dWFjloYZOjQk8okTYYstGvdecbYk\n2gLXANu4+3fNrDOwv7unZo5KOSYJKJ8y4medBe3ald4G8KVq7tzV4xjjx4eLjaOPDkkj1xjDl1+G\niQXXXAMnnFDceKV+BgwI40XPPde4fcXjTBLPAEOAy9x9TzPbAJji7qnZKqZck8TkyWGwat680i0j\nvmJFqGE0aVJYDyDF9d//hnUOTz0V6gG1arW6W+qgg1Z/yJx9diiZ8uCDycYrdVu1KrT4OnSAP/+5\n4e8T++wmM5vi7ntHj01199T0LpdrkoDSLyM+enRY+/Hyy0lHIjU1oUJtbStjzpww46xz51AC4vXX\nYdNNk45S8rFkSWgh9u8fxi8bIs7aTV+a2eaAR2/cHfi8YWFJfQ0YEEp1lGoZ8Uoqw5F2TZrAPvvA\n734X+rRnzgwzo95+OyzYUoIoHa1ahdIdAweGPSgKKZ+WRBfgNmB34A2gDXCCu6dm48lybkm4Q9eu\nYQppqZURX7IkjEXMmwebb550NCLl5/nnQ9n9116r/+LbWGc3ReMQuxDKcrzl7ivrF05hlXOSgFA5\n9Y9/LL0y4vfdF2IfNSrpSETK1623wl13wauvhv1M8hVndxNAV2BPoAtwipmdnn8o0lh9+pRmGfFy\n3lxIJC3OPTf0Npx+emG6pfPpbnoA6AhMZXWJcHf3/vGH0zDl3pKAcKUwcmRYQFUKPvggDIguWgQb\nb5x0NCLlbfnyUBPtiCPgiivye02cs5tmAp3T/ClcCUli+fJQ+O/pp0ujbPNNN4UyEvfem3QkIpXh\no49Ci+KGG+DEE+s+Ps7upjeAmCrDSEM1bw7nnVc6mxJpVpNIcbVtG0rf/PKXYdV8XHK2JMxsNGHa\n6ybAXsBEYHnt8+7eO74wGqcSWhKwuoz4xInhe1q99RYcdhjMn6+KryLFNnw4XHxx+JxYX4HQRnc3\nmdmh63uhuxd4dm7+KiVJQCgjvmRJ41ZaFtrvfgdffAF/+lPSkYhUpssvh+rqUGo+V7WGuKfAtgVq\n95SY6O7/zj/cwqukJJH2MuLuoSLp8OFh4ZaIFF9NDXz/+6GS9ODB2afOxzYmYWYnEbqaTgROAiaY\nmUqAJWSrreCkk8Lc6DQaPz7UA+rSJelIRCpXkybwwAPh77GxvQ75zG6aBhxZ23owszbA8+6e1xwb\nM+sJ3ExISHe7+3VrPd8buAqoAVYC57v7K9Fz3wTuIqz2rgHOcPcJWc5RMS0JSHcZ8XPOCYnst79N\nOhIReeedsInU0KFhk6pMcU6BnZ5Z8dXMmgDT8qkCGx07G+gBLAImAX3dfVbGMS3cfWl0+9vAcHfv\nFN2/F/iHuw+JVn23cPclWc5TUUkCQhnxrl3h179OOpLVVq4MFV/Hj0/3wLpIJamuDp8Xr766Zqn4\nOKfAPmtm/2dmPzazHwNPAfku6eoKzHH396JSHg8Da2y+V5sgIi0JLQbMrBVwsLsPiY77OluCqFSX\nXBLWIiw9huGAAAAPLklEQVRfXvexxTJmTBiPUIIQSY+qKhg0KGw7sKQBn6B1Jgl3vwi4E9gj+hrs\n7hfn+f7bAvMz7i+IHluDmfWJFu2NBs6IHt4B+NjMhpjZZDMbbGZauxvp0iUMYA8dmnQkqz34oNZG\niKTRWWfBIYeEv89Vq+o+PtP6psDuBLStHR/IePwg4AN3n1vnm5sdDxzl7mdG908DuuYq6RG990B3\nP9LM9gHGE3bB+6eZ3Qx87u4Ds7zOBw5c/XBVVRVVVVV1hVfyXnwxLJyZMaPhexrH5YsvYLvtwh4F\nbdokG4uIrGvMmGp+/vNq2rUL4xNXXHFFXt1NG6znuZuBS7M8/nn03DF5xLUQaJ9xf7vosazcfZyZ\n7WhmmxFaHfPd/Z/R0yOAS3K9dtCgQXmEU14OOywMXD/xBBx3XLKxPP542OlMCUIknY48sopJk6ro\n2hV22QUgvyJP67v+bOvu09d+MHps+zzjmgTsZGYdzKwZ0BdYo3C0mXXMuN0FaObun7j7R8B8M9s5\neroHMCPP81YEs7Ap0bXXhvUJSVIZDpH0a9MmXFT2r0d51vUlifXtU5XX2IC7rwLOAZ4D3gQedveZ\nZtbPzM6MDjvezN4ws8mEzY1OyniL/sBQM5tKKFV+TT7nrSR9+sCnnxZ+d6r1+eijsNdF79QUahGR\nXPbYIyywy9f6xiQeAl5097+t9fjPCOsmTm5EnLGqxCmwmZIuI37rrTBpUli8IyKlIY7aTW2Bx4AV\nwL+ih/cFmgHHufuHMcXaaJWeJJYvD1sXPv007LVX8c/frRtceSUcdVTxzy0iDRPnYrrDCCueAd50\n9xdjiC9WlZ4kIGxvOmVK2NC+mObMgYMPhgULYIP1TYMQkVSJtcBf2ilJhEUyO+wQun2KuZht0KAw\nJnLLLcU7p4g0Xtx7XEvKtWoF/frBjTcW75zumtUkUu7UkigjH30Eu+4aNv0pRhnxiRPhtNPC+bKV\nIhaR9FJLogK1bQt9+xavjHhtK0IJQqR8qSVRZubODbONCl1G/OuvQxmOceNCUT8RKS1qSVSojh3h\niCPqt1imIZ5/Hjp0UIIQKXdKEmWoGGXEhw4N4xEiUt6UJMrQ3nsXtoz4l1/C6NFhIxMRKW9KEmVq\nwAC4/vqwIXrcRo0KWyIWYwaViCRLSaJMZZYRj5s2FxKpHJrdVMZGjgytifHj45umungxfOtboQxH\ny5bxvKeIFJ9mN0lByogPHw5HH60EIVIplCTKWNOmcPHFYVOiuKgMh0hlUXdTmYuzjPjcuWHAeuFC\n2HDDeOITkWSou0kAaN4czj8frruu8e81bBicdJIShEglUUuiAsRRRtwdOnWCe++F7t1jDU9EEqCW\nhPxPbRnxG25o+HtMnhzqNXXrFl9cIpJ+aklUiNoy4rNmhWqx9XXBBWFG05VXxh+biBSfdqaTdZx1\nFmy2Gfz+9/V73apVoeJrdTXssktBQhORIlN3k6zjwgvhzjvDGEV9vPhiSBJKECKVR0migjS0jLjW\nRohULnU3VZgpU+CYY8Kah+bN6z5+6VLYdluYORO22qrw8YlIcai7SbLae2/Ybbf8y4iPHg377acE\nIVKplCQq0CWX5F9GXJsLiVQ2JYkKdNhhYe1EXWXE//MfGDsWjjuuOHGJSPooSVQgs9CauPbasJI6\nl0cegZ49w74UIlKZCp4kzKynmc0ys9lmdkmW53ub2TQzm2JmE83swIzn3s18rtCxVpI+feCzz9Zf\nRlyzmkSkoLObzKwJMBvoASwCJgF93X1WxjEt3H1pdPvbwHB37xTdnwfs4+6f1nEezW5qgLvughEj\n4Nln133u3XfDgPXChdCsWdFDE5ECS8vspq7AHHd/z91XAg8Dx2YeUJsgIi2BzOFUK0KMFeuHP4Tp\n02Hq1HWfGzYMTjxRCUKk0hX6A3hbYH7G/QXRY2swsz5mNhMYDZyR8ZQDY8xskpn9vKCRVqDmzeFX\nv1q3jLi7uppEJEjFVbq7Px51MfUBrs546kB37wJ8DzjbzA5KJMAy1q8fjBkD8+atfmzatLCI7oAD\nkotLRNJhgwK//0Kgfcb97aLHsnL3cWa2o5lt5u6fuPsH0eOLzewxQvfVuGyvHTRo0P9uV1VVUVVV\n1fjoK0CrVnDmmaGM+F/+Eh578EE49dQwC0pEykN1dTXV1dX1fl2hB66bAm8RBq4/ACYCp7j7zIxj\nOrr73Oh2F+AJd29nZi2AJu7+hZl9A3gOuMLdn8tyHg1cN0JmGfEttoD27UPronPnpCMTkULJd+C6\noC0Jd19lZucQPuCbAHe7+0wz6xee9sHA8WZ2OrACWAacFL28LfCYmXkU59BsCUIar21b6NsXbr0V\nevQI95UgRARU4E8ic+eGXecOPzx8//Wvk45IRApJmw5JvfXtC8OHw/z5ofKriJSvVHQ3SWm5/PIw\nHqEEISK11JIQEalAaVlxLSIiJUxJQkREclKSEBGRnJQkREQkJyUJERHJSUlCRERyUpIQEZGclCRE\nRCQnJQkREclJSUJERHJSkhARkZyUJEREJCclCRERyUlJQkREclKSEBGRnJQkREQkJyUJERHJSUlC\nRERyUpIQEZGclCRERCQnJQkREclJSUJERHJSkhARkZyUJEREJCclCRERyangScLMeprZLDObbWaX\nZHm+t5lNM7MpZjbRzA5c6/kmZjbZzEYVOlYREVlTQZOEmTUBbgeOAnYDTjGzXdc67Hl339Pd9wZ+\nCty11vPnATMKGWcxVFdXJx1CXhRnvBRnvBRn8RW6JdEVmOPu77n7SuBh4NjMA9x9acbdlkBN7R0z\n2w74HusmjpJTKr80ijNeijNeirP4Cp0ktgXmZ9xfED22BjPrY2YzgdHAGRlP3QRcBHghgxQRkexS\nMXDt7o+7eyegD3A1gJkdDXzk7lMBi75ERKSIzL1wF+lm1h0Y5O49o/sDAHf369bzmrnAfsCFwGnA\n18DGwCbAo+5+epbXqKUhIlJP7l7nxXehk0RT4C2gB/ABMBE4xd1nZhzT0d3nRre7AE+4e7u13udQ\n4Nfu3rtgwYqIyDo2KOSbu/sqMzsHeI7QtXW3u880s37haR8MHG9mpwMrgGXASYWMSURE8lfQloSI\niJS2VAxcN5SZ3W1mH5nZ60nHkouZbWdmL5rZm2Y23cz6Jx1TNmbW3MwmRIsap5vZwKRjyqVUFlia\n2buZC0WTjicbM/ummT1iZjOj39FuSce0NjPbOfo3nBx9/zzFf0fnm9kbZva6mQ01s2ZJx5SNmZ0X\n/Z3X+ZlU0i0JMzsI+AK43933SDqebMxsK2Ard59qZi2BfwHHuvushENbh5m1cPel0VjSK0B/d0/d\nh5uZnQ/sA7RK8ziVmc0D9nH3T5OOJRczuxf4h7sPMbMNgBbuviThsHKKFuguALq5+/y6ji8mM9sG\nGAfs6u4rzOzvwFPufn/Coa3BzHYDHiJMEPoaeAb4hbvPy3Z8Sbck3H0ckNo/QAB3/zCaxou7fwHM\nJMtakTTIWNjYnDBelboriBJbYGmk+G/MzFoBB7v7EAB3/zrNCSJyBDA3bQkiQ1PgG7UJF1iUcDzZ\ndAImuPtyd18FjAW+n+vg1P4ClyMz2x7YC5iQbCTZRd04U4APgTHuPinpmLIopQWWDowxs0lm9vOk\ng8liB+BjMxsSdeUMNrONkw6qDicTroJTx90XATcC7wMLgc/c/flko8rqDeBgM2ttZi0IF13tch2s\nJFEkUVfTCOC8qEWROu5eE9XQ2g7oZmadk44pUwkusDzQ3bsQ/gjPjrpH02QDoAvw5yjOpcCAZEPK\nzcw2BHoDjyQdSzZmtimh7FAHYBugpZmdmmxU64q6uq8DxgBPA1OAVbmOV5IogqjpOQJ4wN2fSDqe\nukRdDi8BPZOOZS0HAr2jvv6HgMPMLFX9vZnc/YPo+2LgMUItszRZAMx3939G90cQkkZafRf4V/Tv\nmUZHAPPc/ZOoG+dR4ICEY8rK3Ye4+77uXgV8BszOdWw5JIlSuKK8B5jh7rckHUguZraFmX0zur0x\ncCSQqsF1d/+Nu7d39x2BvsCL2Vbgp4GZtYhaj5jZN4DvEJr5qeHuHwHzzWzn6KEepLvi8imktKsp\n8j7Q3cw2MjMj/HvOrOM1iTCzNtH39sBxwLBcxxZ0MV2hmdkwoArY3MzeBwbWDsKlRbQ/xg+A6VF/\nvwO/cfdnk41sHVsD90WzR5oAf3f3pxOOqZS1BR6LSsZsAAx19+cSjimb/sDQqCtnHvCThOPJKuo7\nPwI4M+lYcnH3iWY2gtB9szL6PjjZqHIaaWabEeL85fomLJT0FFgRESmscuhuEhGRAlGSEBGRnJQk\nREQkJyUJERHJSUlCRERyUpIQEZGclCRERCQnJQmRAjGzH0Wl4hv6+n5mdlqcMYnUlxbTiRSImb0E\nXOju/0o6FpGGUktCKoqZdTCzGVFZ7DfM7Nmo1s5LZtYlOmZzM3snuv0jM3vMzJ4zs3lmdna0+9hk\nM3s1qvyZ7TzHA/sCD0bHNjezHtHtaWZ2V1QKAzN7x8yui3YzG29mO0aPDzSzC6LbHc1sjJlNNbN/\nmtkOZraVmf0jes/XoxIwIrFSkpBKtBNwm7vvTqiAeTzr7k+ReX83oA+hiuvvgS+i0trjgawFBt19\nJDAJODU6FmAIcKK77wlsCJyV8ZJPo90V/wxkKwQ5NIp5L0Jl0Q+BU4Fno/ffE5iax88uUi9KElKJ\n3nH36dHtycD2dRz/krsvdfePCUnlyejx6XW8NrNC8S6EMtJzo/v3AYdkHPtw9P0hoPsabxKqyW7j\n7qMA3H2Fuy8jJKGfmNnvgD3c/cs6fg6RelOSkEq0POP2KkKV1q9Z/few0XqO94z7NdSvkvL6Stp7\njts5X+vuLxMSzULgXg1ySyEoSUglyvZh/S5hDAHgxJjOswRoFd1+C+hQO94A/BCozjj25Oh7X+C1\nzDeJdjKcb2bHAphZMzPbONoL4N/ufjdhz+80bxgkJaqk95MQaaBs4w83AI9Ee1E/VY/Xrs99wB1m\nthTYHzgDGGFmTQldRXdmHNvazKYBXxE211nb6cCdZnYlsIKQyA4BLjKzlcB/yTE+ItIYmgIrkrBo\nJtU+7v5J0rGIrE3dTSLJ05WapJZaEiKNZGa3AwcSPuwt+n6Lu9+XaGAiMVCSEBGRnNTdJCIiOSlJ\niIhITkoSIiKSk5KEiIjkpCQhIiI5/T+znzSFGysMFAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f3455ff9c50>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"lm_list = evaluate_graph(dictionary, corpus, texts, 10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, 2 would be the best choice for the number of topics which is also the correct value since this corpus was designed to 2 topics, namely \"Human-Computer interaction\" and \"Graphs and trees\". Hence, just select `lm_list[1]` for your LdaModel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Making LDA behave like LSI\n",
"Here we just have 2 topics however, the same approach can be used for multiple topics."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"lm = lm_list[1]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" [(u'user', 0.14710733043762769),\n",
" (u'system', 0.11091383992680816),\n",
" (u'response', 0.10774688179113835),\n",
" (u'time', 0.10233047821720406),\n",
" (u'trees', 0.087948794803798927),\n",
" (u'graph', 0.079485146451697894),\n",
" (u'survey', 0.07526115085274368),\n",
" (u'interface', 0.074644398934470682),\n",
" (u'eps', 0.069599948233048423),\n",
" (u'computer', 0.058999396488198623)]),\n",
" (1,\n",
" [(u'system', 0.13282887492911646),\n",
" (u'graph', 0.11537522576174111),\n",
" (u'human', 0.11181322589854704),\n",
" (u'trees', 0.10703405658911989),\n",
" (u'minors', 0.094033312724915177),\n",
" (u'computer', 0.087136990647558726),\n",
" (u'eps', 0.076689841646505566),\n",
" (u'interface', 0.071718390218816486),\n",
" (u'survey', 0.071110563443194746),\n",
" (u'user', 0.048731616175960051)])]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm.show_topics(formatted=False)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"coherence_values = {}\n",
"for n, topic in lm.show_topics(formatted=False):\n",
" topic = [word for word, _ in topic]\n",
" cm = CoherenceModel(topics=[topic], texts=texts, dictionary=dictionary, window_size=2)\n",
" coherence_values[n] = cm.get_coherence()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sorting in decreasing order of coherence values for ranking"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.40844152859807636), (1, 0.3367503796800731)]\n"
]
}
],
"source": [
"print sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Making your own coherence pipeline\n",
"Let's modify `c_uci` to use `s_one_pre` instead of `s_one_one` segmentation"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gensim.topic_coherence import (segmentation, probability_estimation,\n",
" direct_confirmation_measure, indirect_confirmation_measure,\n",
" aggregation)\n",
"from gensim.matutils import argsort"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"measure = make_pipeline(segmentation.s_one_one,\n",
" probability_estimation.p_boolean_sliding_window,\n",
" direct_confirmation_measure.log_ratio_measure,\n",
" aggregation.arithmetic_mean)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get topics out of the topic model"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"topics = []\n",
"for topic in lm.state.get_lambda():\n",
" bestn = argsort(topic, topn=10, reverse=True)\n",
"topics.append(bestn)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Perform segmentation\n",
"segmented_topics = measure.seg(topics)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Since this is a window-based coherence measure we will perform window based prob estimation\n",
"per_topic_postings, num_windows = measure.prob(texts=texts, segmented_topics=segmented_topics,\n",
" dictionary=dictionary, window_size=2)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=False)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-16.2529427009\n"
]
}
],
"source": [
"print measure.aggr(confirmed_measures)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@chaithanyateegala
Copy link

hi,
i tried to run the same code to find optimal no. of topics but iam getting constant straight line in the graph.
could you please help me in this.

Thank you,

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment