Skip to content

Instantly share code, notes, and snippets.

@akelleh
Created February 8, 2015 23:35
Show Gist options
  • Save akelleh/41831fc1bc954ddce414 to your computer and use it in GitHub Desktop.
Save akelleh/41831fc1bc954ddce414 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import gensim\nfrom sklearn.datasets import fetch_20newsgroups\nfrom pprint import pprint\nimport matplotlib.pyplot as pp\nimport string",
"prompt_number": 161,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Load the training split of the 20 Newsgroups corpus.\nTRAIN_SUBSET = 'train'\nnewsgroups_train = fetch_20newsgroups(subset=TRAIN_SUBSET)",
"prompt_number": 162,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# The file-name array and the target array should have the same length.\nfor per_document_array in (newsgroups_train.filenames, newsgroups_train.target):\n    print(per_document_array.shape)",
"prompt_number": 163,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "(11314,)\n(11314,)\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Inspect the container type, the element type and one sample post.\ndocuments = newsgroups_train.data\nprint(type(documents))\nprint(type(documents[0]))\nprint(documents[1000])",
"prompt_number": 164,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "<type 'list'>\n<type 'unicode'>\nFrom: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\nSubject: Diamond SS24X, Win 3.1, Mouse cursor\nOrganization: National Library of Medicine\nLines: 10\n\n\nAnybody seen mouse cursor distortion running the Diamond 1024x768x256 driver?\nSorry, don't know the version of the driver (no indication in the menus) but it's a recently\ndelivered Gateway system. Am going to try the latest drivers from Diamond BBS but wondered\nif anyone else had seen this.\n\npost or email\n\n--Don Lindbergh\ndabl2@lhc.nlm.nih.gov\n\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Identity translation table: translate() is used below only to delete punctuation.\ntable = string.maketrans(\"\", \"\")\n\n\ndef tokenize(document):\n    \"\"\"Return the lowercase, punctuation-free ASCII tokens of one document.\"\"\"\n    ascii_text = document.encode('ascii', 'ignore')\n    return ascii_text.translate(table, string.punctuation).lower().split()\n\n\ndata = [tokenize(document) for document in newsgroups_train.data]\n",
"prompt_number": 165,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#all_tokens = sum( data, [ ] )\n#tokens_to_ignore = set( word for word in set( all_tokens ) if all_tokens.count(word) <= 2 )",
"prompt_number": 6,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Corpus-wide frequency of every token (total occurrences across all documents).\ntoken_count = {}\nfor doc in data:\n    for word in doc:\n        token_count[word] = token_count.get(word, 0) + 1\n\n# Hand-picked words to drop regardless of how often they occur. Tokens are\n# lowercased upstream, so 'I' can never match one; it stays in the list only\n# so that the size of tokens_to_ignore printed below is unchanged.\nstopwords = [ 'very','too','does','am','need','both','into','im','why','been','make','many','than','key','must','most','get','should','us','use','not', 'but','there','we','would', 'he','and','I', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'for', 'from','i', 'you', 'me', 'my','how','in', 'is', 'it', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was', 'what', 'when', 'where','who', 'will', 'with', 'the', 'www','if','are','they','have','has','had' ]\n\n# Frequency cut-offs: a token is dropped when it occurs at most\n# MIN_TOKEN_COUNT times or more than MAX_TOKEN_COUNT times in the corpus.\nMIN_TOKEN_COUNT = 5\nMAX_TOKEN_COUNT = 5000\ntokens_to_ignore = set(stopwords)\ntokens_to_ignore.update(\n    word for word, count in token_count.items()\n    if count <= MIN_TOKEN_COUNT or count > MAX_TOKEN_COUNT\n)\nprint(len(tokens_to_ignore))",
"prompt_number": 166,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "113672\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Drop every ignored token; document order and token order are preserved.\nfiltered_documents = []\nfor document in data:\n    filtered_documents.append([word for word in document if word not in tokens_to_ignore])\ndata = filtered_documents",
"prompt_number": 167,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Number of distinct tokens counted vs. size of the ignore set.\nprint(len(token_count))\nprint(len(tokens_to_ignore))\n\n# Histogram of corpus frequencies for tokens that occur fewer than 50 times.\nlow_counts = [count for count in token_count.values() if count < 50]\nres = pp.hist(low_counts, bins=100)",
"prompt_number": 168,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "138753\n113672\n"
},
{
"output_type": "display_data",
"metadata": {},
"png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEACAYAAABcXmojAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGBxJREFUeJzt3X+snuV93/H3J7gQEjxbViLzw4ag7jDFFVEIW5w1nfIw\nKHKjzPAHAkcrsiYrUud2yaqpGu4fjf2PEzJthGoCaSspxk1cvJKAozCCQ3O6ThtxQ6FxcVzMVGv2\nIT5EdmrKIpAN3/3xXOf2w+HY54cPPsec90s6Otfzva/rPtd9hZzPue/7uR+nqpAkCeA9cz0BSdL8\nYShIkjqGgiSpYyhIkjqGgiSpYyhIkjqThkKSTUmeT7I3yTeSXJRkWZLdSV5I8mSSpeP6H0iyP8nN\nA/Xr2z4OJLl3oH5Rkodb/ekkV83+YUqSpuKMoZDkQ8DngI9V1bXABcA64C5gd1VdAzzVXpNkFXAH\nsApYA9yXJG139wMbqmoIGEqyptU3AEdb/R7g7lk7OknStEx2pvAKcAJ4X5JFwPuAl4C1wLbWZxtw\na2vfAuyoqhNVdRB4EVid5DJgcVXtaf0eGhgzuK9HgBvP6ogkSTN2xlCoqmPAfwT+L/0w+Luq2g0s\nr6rR1m0UWN7alwOHB3ZxGLhigvpIq9O+H2o/7yRwPMmymR6QJGnmJrt89IvAvwU+RP8X+yVJfn2w\nT/U/J8PPypCkd4FFk2z/x8D/qqqjAEm+CfxT4EiSS6vqSLs09HLrPwKsHBi/gv4Zwkhrj6+PjbkS\neKldolrSzlDeIonBI0kzUFWZvFffZPcU9gOfSHJxu2F8E7AP+DawvvVZDzza2ruAdUkuTHI1MATs\nqaojwCtJVrf93Ak8NjBmbF+30b9xfboD86uKL37xi3M+h/ny5Vq4Fq7Fmb+m64xnClX1V0keAn4I\nvAn8JfBfgMXAziQbgIPA7a3/viQ76QfHSWBjnZrVRuBB4GLg8ap6otUfALYnOQAcpf/uJknSHJjs\n8hFV9RXgK+PKx+ifNUzUfyuwdYL6M8C1E9Rfp4WKJGlu+UTzeajX6831FOYN1+IU1+IU12LmMpNr\nTnMhSZ0vc5Wk+SIJNYs3miVJC4ihIEnqGAqSpI6hIEnqGAqSpI6hIEnqTPrw2nx34MAB3njjje71\nihUruOSSS+ZwRpJ0/jrvn1NYuvSDvPHGEpJFvPbaIb75zYf5zGc+MwczlKT5Z7rPKZz3ZwpvvAGv\nvvq/gQ+yZIlhIElnw3sKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6kwaCkn+UZJn\nB76OJ/l8kmVJdid5IcmTSZYOjNmU5ECS/UluHqhfn2Rv23bvQP2iJA+3+tNJrpr9Q5UkTWbSUKiq\nv6mq66rqOuB64OfAt4C7gN1VdQ3wVHtNklXAHcAqYA1wX5KxR6zvBzZU1RAwlGRNq28Ajrb6PcDd\ns3WAkqSpm+7lo5uAF6vqELAW2Nbq24BbW/sWYEdVnaiqg8CLwOoklwGLq2pP6/fQwJjBfT0C3Djd\nA5Eknb3phsI6YEdrL6+q0dYeBZa39uXA4YExh4ErJqiPtDrt+yGAqjoJHE+ybJpzkySdpSmHQpIL\ngX8B/Lfx29rHl54fH7cqSTqt6XxK6q8Bz1TVT9vr0SSXVtWRdmno5VYfAVYOjFtB/wxhpLXH18fG\nXAm8lGQRsKSqjo2fwObNm7t2r9ej1+tNY/qS9O43PDzM8PDwjMdP+d9TSPLHwH+vqm3t9Vfo3xy+\nO8ldwNKquqvdaP4G8HH6l4W+B/zDqqokPwA+D+wBvgP8flU9kWQjcG1V/esk64Bbq2rduJ8/4b+n\nsHjxB3n11X2MfXT2H/3Rb/jvKUhS8478ewpJ3k//JvPnBspfBnYm2QAcBG4HqKp9SXYC+4CTwMaB\n3+YbgQeBi4HHq+qJVn8A2J7kAHCU/r0LSdI5NqVQqK
r/B3xgXO0Y/aCYqP9WYOsE9WeAayeov04L\nFUnS3PGJZklSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQ\nJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSZ0qhkGRpkj9J8uMk+5KsTrIsye4kLyR5\nMsnSgf6bkhxIsj/JzQP165PsbdvuHahflOThVn86yVWze5iSpKmY6pnCvcDjVfVh4CPAfuAuYHdV\nXQM81V6TZBVwB7AKWAPclyRtP/cDG6pqCBhKsqbVNwBHW/0e4O6zPjJJ0rRNGgpJlgD/rKq+BlBV\nJ6vqOLAW2Na6bQNube1bgB1VdaKqDgIvAquTXAYsrqo9rd9DA2MG9/UIcONZHZUkaUamcqZwNfDT\nJH+Y5C+T/Nck7weWV9Vo6zMKLG/ty4HDA+MPA1dMUB9pddr3Q9APHeB4kmUzOSBJ0swtmmKfjwG/\nVVV/keSrtEtFY6qqktQ7McFBmzdv7tq9Xo9er/dO/0hJOq8MDw8zPDw84/FTCYXDwOGq+ov2+k+A\nTcCRJJdW1ZF2aejltn0EWDkwfkXbx0hrj6+PjbkSeCnJImBJVR0bP5HBUJAkvd34P5i3bNkyrfGT\nXj6qqiPAoSTXtNJNwPPAt4H1rbYeeLS1dwHrklyY5GpgCNjT9vNKe+dSgDuBxwbGjO3rNvo3riVJ\n59hUzhQA/g3w9SQXAv8H+FfABcDOJBuAg8DtAFW1L8lOYB9wEthYVWOXljYCDwIX03830xOt/gCw\nPckB4Ciw7iyPS5I0A1MKhar6K+CfTLDpptP03wpsnaD+DHDtBPXXaaEiSZo7PtEsSeoYCpKkjqEg\nSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoY\nCpKkjqEgSeoYCpKkjqEgSepMKRSSHEzyoyTPJtnTasuS7E7yQpInkywd6L8pyYEk+5PcPFC/Psne\ntu3egfpFSR5u9aeTXDWbBylJmpqpnikU0Kuq66rq4612F7C7qq4BnmqvSbIKuANYBawB7kuSNuZ+\nYENVDQFDSda0+gbgaKvfA9x9lsclSZqB6Vw+yrjXa4Ftrb0NuLW1bwF2VNWJqjoIvAisTnIZsLiq\n9rR+Dw2MGdzXI8CN05iXJGmWTOdM4XtJfpjkc622vKpGW3sUWN7alwOHB8YeBq6YoD7S6rTvhwCq\n6iRwPMmy6RyIJOnsLZpiv09W1U+SfBDYnWT/4MaqqiQ1+9N7q82bN3ftXq9Hr9d7p3+kJJ1XhoeH\nGR4envH4KYVCVf2kff9pkm8BHwdGk1xaVUfapaGXW/cRYOXA8BX0zxBGWnt8fWzMlcBLSRYBS6rq\n2Ph5DIaCJOntxv/BvGXLlmmNn/TyUZL3JVnc2u8Hbgb2AruA9a3beuDR1t4FrEtyYZKrgSFgT1Ud\nAV5JsrrdeL4TeGxgzNi+bqN/41qSdI5N5UxhOfCt9gaiRcDXq+rJJD8EdibZABwEbgeoqn1JdgL7\ngJPAxqoau7S0EXgQuBh4vKqeaPUHgO1JDgBHgXWzcGySpGmaNBSq6m+Bj05QPwbcdJoxW4GtE9Sf\nAa6doP46LVQkSXPHJ5olSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLU\nMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSZ0phUKSC5I8m+Tb7fWyJLuT\nvJDkySRLB/puSnIgyf4kNw/Ur0+yt227d6B+UZKHW/3pJFfN5gFKkqZuqmcKXwD2AdVe3wXsrqpr\ngKfaa5KsAu4AVgFrgPuSpI25H9hQVUPAUJI1rb4BONrq9wB3n90hSZJmatJQSLIC+DTwB8DYL/i1\nwLbW3gbc2tq3AD
uq6kRVHQReBFYnuQxYXFV7Wr+HBsYM7usR4MYZH40k6axM5UzhHuB3gDcHasur\narS1R4HlrX05cHig32HgignqI61O+34IoKpOAseTLJvGMUiSZsmiM21M8hng5ap6Nklvoj5VVUlq\nom2zbfPmzV271+vR6004JUlasIaHhxkeHp7x+DOGAvDLwNoknwbeC/yDJNuB0SSXVtWRdmno5dZ/\nBFg5MH4F/TOEkdYeXx8bcyXwUpJFwJKqOjbRZAZDQZL0duP/YN6yZcu0xp/x8lFV/W5Vrayqq4F1\nwJ9W1Z3ALmB967YeeLS1dwHrklyY5GpgCNhTVUeAV5Ksbjee7wQeGxgztq/b6N+4liTNgcnOFMYb\nu0z0ZWBnkg3AQeB2gKral2Qn/XcqnQQ2VtXYmI3Ag8DFwONV9USrPwBsT3IAOEo/fCRJc2DKoVBV\nfwb8WWsfA246Tb+twNYJ6s8A105Qf50WKpKkueUTzZKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoY\nCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKk\nzhlDIcl7k/wgyXNJ9iX5UqsvS7I7yQtJnkyydGDMpiQHkuxPcvNA/foke9u2ewfqFyV5uNWfTnLV\nO3GgkqTJnTEUquo14Iaq+ijwEeCGJL8C3AXsrqprgKfaa5KsAu4AVgFrgPuSpO3ufmBDVQ0BQ0nW\ntPoG4Gir3wPcPZsHKEmaukkvH1XVz1vzQuAC4GfAWmBbq28Dbm3tW4AdVXWiqg4CLwKrk1wGLK6q\nPa3fQwNjBvf1CHDjjI9GknRWJg2FJO9J8hwwCny/qp4HllfVaOsyCixv7cuBwwPDDwNXTFAfaXXa\n90MAVXUSOJ5k2cwOR5J0NhZN1qGq3gQ+mmQJ8N0kN4zbXknqnZrgoM2bN3ftXq9Hr9c7Fz9Wks4b\nw8PDDA8Pz3j8pKEwpqqOJ/kOcD0wmuTSqjrSLg293LqNACsHhq2gf4Yw0trj62NjrgReSrIIWFJV\nxyaaw2AoSJLebvwfzFu2bJnW+MneffSBsXcWJbkY+FXgWWAXsL51Ww882tq7gHVJLkxyNTAE7Kmq\nI8ArSVa3G893Ao8NjBnb1230b1xLkubAZGcKlwHbkryHfoBsr6qnkjwL7EyyATgI3A5QVfuS7AT2\nASeBjVU1dmlpI/AgcDHweFU90eoPANuTHACOAutm6+AkSdNzxlCoqr3AxyaoHwNuOs2YrcDWCerP\nANdOUH+dFiqSpLnlE82SpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnq\nGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpM6koZBkZZLvJ3k+yV8n+Xyr\nL0uyO8kLSZ5MsnRgzKYkB5LsT3LzQP36JHvbtnsH6hclebjVn05y1WwfqCRpclM5UzgB/HZV/RLw\nCeA3k3wYuAvYXVXXAE+11yRZBdwBrALWAPclSdvX/cCGqhoChpKsafUNwNFWvwe4e1aOTpI0LZOG\nQlUdqarnWvtV4MfAFcBaYFvrtg24tbVvAXZU1YmqOgi8CKxOchmwuKr2tH4PDYwZ3NcjwI1nc1CS\npJmZ1j2FJB8CrgN+ACyvqtG2aRRY3tqXA4cHhh2mHyLj6yOtTvt+CKCqTgLHkyybztxOM9+3fEmS\nzmzRVDsmuYT+X/FfqKq/H/wlW1WVpN6B+b3F5s2bu3av16PX601h1Ni0DAVJ737Dw8MMDw/PePyU\nQiHJL9APhO1V9Wgrjya5tKqOtEtDL7f6CLByYPgK+mcII609vj425krgpSSLgCVVdWz8PAZDQZL0\nduP/YN6yZcu0xk/l3UcBHgD2VdVXBzbtAta39nrg0YH6uiQXJrkaGAL2VNUR4JUk
q9s+7wQem2Bf\nt9G/cS1JOsemcqbwSeDXgR8lebbVNgFfBnYm2QAcBG4HqKp9SXYC+4CTwMaqGruGsxF4ELgYeLyq\nnmj1B4DtSQ4AR4F1Z3lckqQZmDQUqup/cvoziptOM2YrsHWC+jPAtRPUX6eFiiRp7vhEsySpYyhI\nkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqG\ngiSpYyhIkjqGgiSpYyhIkjqThkKSryUZTbJ3oLYsye4kLyR5MsnSgW2bkhxIsj/JzQP165Psbdvu\nHahflOThVn86yVWzeYCSpKmbypnCHwJrxtXuAnZX1TXAU+01SVYBdwCr2pj7kqSNuR/YUFVDwFCS\nsX1uAI62+j3A3WdxPJKkszBpKFTVnwM/G1deC2xr7W3Ara19C7Cjqk5U1UHgRWB1ksuAxVW1p/V7\naGDM4L4eAW6cwXFIkmbBTO8pLK+q0dYeBZa39uXA4YF+h4ErJqiPtDrt+yGAqjoJHE+ybIbzkiSd\nhbO+0VxVBdQszEWSNMcWzXDcaJJLq+pIuzT0cquPACsH+q2gf4Yw0trj62NjrgReSrIIWFJVxyb6\noZs3b+7avV6PXq83w+lL0rvT8PAww8PDMx4/01DYBaynf1N4PfDoQP0bSf4T/ctCQ8CeqqokryRZ\nDewB7gR+f9y+ngZuo3/jekKDoSBJervxfzBv2bJlWuMnDYUkO4BPAR9Icgj4PeDLwM4kG4CDwO0A\nVbUvyU5gH3AS2NguLwFsBB4ELgYer6onWv0BYHuSA8BRYN20jkCSNGsmDYWq+uxpNt10mv5bga0T\n1J8Brp2g/jotVM6VU++Sfcs8zuUUJGlemunlo3eBwRB4e0hI0kLkx1xIkjqGgiSpYyhIkjqGgiSp\nYyhIkjqGgiSpYyhIkjoL+DmFiY1/sM2H2iQtJJ4pTMgPfpW0MBkKkqSOoSBJ6hgKkqSOoSBJ6vju\noyny47YlLQSGwrT4cduS3t0MhVngsw2S3i28pzBrfLZB0vnPM4V3kGcQks438+ZMIcmaJPuTHEjy\n7+d6PrNn4jOIJG/5kqT5YF6EQpILgP8MrAFWAZ9N8uG5ndW58PbAmEpYDA8Pv/NTO0+4Fqe4Fqe4\nFjM3L0IB+DjwYlUdrKoTwB8Dt8zxnObQmcPihhtueEtgnC5IxtffjWck/p//FNfiFNdi5uZLKFwB\nHBp4fbjV9BZjYfHFM2w7Xf309zNmEirTrU+2TdL8MF9uNM/4DmwCixf/S5KLeO21PcBvzOK0FpKx\n/wnG/7I+07MZk42Z6Bf/27ed6Yb86bYN1rds2TJh/UxjplKfbNvp+rxTP3Oqa3Guf/5Mxww62/ET\n7WuytZjKz1mIbxbJfDjIJJ8ANlfVmvZ6E/BmVd090GfuJypJ56GqmvKp+XwJhUXA3wA3Ai8Be4DP\nVtWP53RikrTAzIvLR1V1MslvAd8FLgAeMBAk6dybF2cKkqT5Yb68++i03r0PtU0uydeSjCbZO1Bb\nlmR3kheSPJlk6VzO8VxJsjLJ95M8n+Svk3y+1RfceiR5b5IfJHkuyb4kX2r1BbcWY5JckOTZJN9u\nrxfkWiQ5mORHbS32tNq01mJeh8LCfait84f0j33QXcDuqroGeKq9XghOAL9dVb8EfAL4zfbfwoJb\nj6p6Dbihqj4KfAS4IcmvsADXYsAXgH2cenvbQl2LAnpVdV1VfbzVprUW8zoUWOAPtVXVnwM/G1de\nC2xr7W3Ared0UnOkqo5U1XOt/SrwY/rPsizU9fh5a15I/z7cz1iga5FkBfBp4A849V7nBbkWzfh3\nGk1rLeZ7KPhQ29str6rR1h4Fls/lZOZCkg8B1wE/YIGuR5L3JHmO/jF/v6qeZ4GuBXAP8DvAmwO1\nhboWBXwvyQ+TfK7VprUW8+LdR2fgXfAzqKpa
aM9vJLkEeAT4QlX9/eDDRQtpParqTeCjSZYA301y\nw7jtC2ItknwGeLmqnk3Sm6jPQlmL5pNV9ZMkHwR2J9k/uHEqazHfzxRGgJUDr1fSP1tYyEaTXAqQ\n5DLg5TmezzmT5BfoB8L2qnq0lRfsegBU1XHgO8D1LMy1+GVgbZK/BXYA/zzJdhbmWlBVP2nffwp8\ni/4l+GmtxXwPhR8CQ0k+lORC4A5g1xzPaa7tAta39nrg0TP0fddI/5TgAWBfVX11YNOCW48kHxh7\nB0mSi4FfBZ5lAa5FVf1uVa2sqquBdcCfVtWdLMC1SPK+JItb+/3AzcBeprkW8/45hSS/BnyVUw+1\nfWmOp3TOJNkBfAr4AP1rgb8HPAbsBK4EDgK3V9XfzdUcz5X27pr/AfyIU5cVN9F/+n1BrUeSa+nf\nMHxP+9peVf8hyTIW2FoMSvIp4N9V1dqFuBZJrqZ/dgD9WwNfr6ovTXct5n0oSJLOnfl++UiSdA4Z\nCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkzv8HHdrlNjOhK6QAAAAASUVORK5CYII=\n",
"text": "<matplotlib.figure.Figure at 0x116854810>"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Peek at the first five filtered documents.\nfirst_documents = data[:5]\nprint(first_documents)",
"prompt_number": 169,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[['wheres', 'thing', 'car', 'nntppostinghost', 'rac3wamumdedu', 'maryland', 'college', 'park', '15', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', '2door', 'sports', 'car', 'looked', 'late', '60s', 'early', '70s', 'called', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'anyone', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'email', 'thanks', 'il', 'brought', 'neighborhood'], ['guykuocarsonuwashingtonedu', 'guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'articleid', 'washington', '11', 'nntppostinghost', 'carsonuwashingtonedu', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', '800', '14', 'm', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'guykuouwashingtonedu'], ['thomas', 'e', 'willis', 'pb', 'questions', 'purdue', 'engineering', 'computer', 'network', 'distribution', 'usa', '36', 'well', 'folks', 'mac', 'plus', 'finally', 'gave', 'ghost', 'weekend', 'after', 'starting', 'life', '512k', 'way', 'back', '1985', 'market', 'new', 'machine', 'bit', 'sooner', 'intended', 'looking', 'picking', 'powerbook', '160', 'maybe', '180', 'bunch', 'questions', 'hopefully', 'somebody', 'answer', 'anybody', 'dirt', 'next', 'round', 'powerbook', 'expected', 'id', 'heard', 'supposed', 'summer', 'havent', 'heard', 'anymore', 'since', 'access', 'wondering', 'anybody', 'info', 'anybody', 'heard', 
'rumors', 'price', 'drops', 'powerbook', 'line', 'ones', 'duos', 'went', 'through', 'recently', 'whats', 'impression', 'display', '180', 'could', 'probably', 'swing', '180', 'got', '80mb', 'disk', 'rather', '120', 'really', 'feel', 'much', 'better', 'display', 'yea', 'looks', 'great', 'store', 'wow', 'really', 'good', 'could', 'solicit', 'opinions', '160', '180', 'daytoday', 'worth', 'taking', 'disk', 'size', 'money', 'hit', 'active', 'display', 'realize', 'real', 'subjective', 'question', 'ive', 'only', 'played', 'around', 'machines', 'computer', 'store', 'figured', 'opinions', 'somebody', 'actually', 'uses', 'machine', 'daily', 'might', 'prove', 'helpful', 'well', 'perform', 'thanks', 'bunch', 'advance', 'info', 'could', 'email', 'ill', 'post', 'summary', 'news', 'reading', 'time', 'premium', 'finals', 'around', 'corner', 'tom', 'willis', 'purdue', 'electrical', 'engineering', 'convictions', 'dangerous', 'enemies', 'truth', 'lies', 'f', 'w', 'nietzsche'], ['joe', 'green', 'weitek', 'p9000', 'harris', 'computer', 'systems', 'division', '14', 'distribution', 'world', 'nntppostinghost', 'xnewsreader', 'tin', 'version', '11', 'pl9', 'robert', 'jc', 'kyanko', 'robrjckuucp', 'wrote', 'anyone', 'weitek', 'p9000', 'graphics', 'chip', 'far', 'lowlevel', 'stuff', 'goes', 'looks', 'pretty', 'nice', 'got', 'fill', 'command', 'requires', 'four', 'points', 'number', 'id', 'information', 'chip', 'joe', 'green', 'harris', 'corporation', 'computer', 'systems', 'division', 'only', 'thing', 'really', 'scares', 'person', 'sense', 'humor', 'jonathan', 'winters'], ['jonathan', 'mcdowell', 'shuttle', 'launch', 'question', 'smithsonian', 'astrophysical', 'observatory', 'cambridge', 'ma', 'usa', 'distribution', 'sci', '23', 'tom', 'baker', 'pack', 'rat', 'clear', 'caution', 'warning', 'memory', 'verify', 'unexpected', 'errors', 'wondering', 'expected', 'error', 'might', 'sorry', 'really', 'dumb', 'question', 'parity', 'errors', 'memory', 'previously', 'known', 'conditions', 'yes', 
'error', 'already', 'knew', 'id', 'curious', 'real', 'meaning', 'quote', 'tom', 'understanding', 'expected', 'errors', 'basically', 'known', 'bugs', 'warning', 'system', 'software', 'things', 'checked', 'right', 'values', 'yet', 'because', 'arent', 'set', 'till', 'after', 'launch', 'rather', 'fix', 'code', 'possibly', 'introduce', 'new', 'bugs', 'tell', 'crew', 'ok', 'see', 'warning', '213', 'before', 'liftoff', 'ignore', 'jonathan']]\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Build the gensim dictionary from the token lists, then invert its\n# token -> id mapping into id -> token.\ndictionary = gensim.corpora.Dictionary(data)\nid2word = dict((token_id, token) for token, token_id in dictionary.token2id.items())",
"prompt_number": 170,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Convert each token list to its bag-of-words form via the dictionary.\ncorpus = []\nfor text in data:\n    corpus.append(dictionary.doc2bow(text))\n",
"prompt_number": 171,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Bag-of-words vector of the first document.\nfirst_bow = corpus[0]\nprint(first_bow)",
"prompt_number": 172,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 5), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)]\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Fit a 20-topic LDA model on the bag-of-words corpus.\n# NOTE(review): no random seed is fixed here, so the topics presumably differ\n# from run to run - confirm and seed if reproducibility matters.\nlda_options = dict(\n    corpus=corpus,\n    id2word=id2word,\n    num_topics=20,\n    update_every=0,  # presumably selects batch rather than online updates - see the gensim docs\n    passes=10,\n)\nlda = gensim.models.ldamodel.LdaModel(**lda_options)",
"prompt_number": 173,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Show the top words of the topics (-1 appears to request every topic).\nall_topics = lda.print_topics(-1)\nall_topics",
"prompt_number": 159,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 159,
"metadata": {},
"text": "[u'0.008*file + 0.004*right + 0.004*use + 0.004*government + 0.004*gun + 0.003*law + 0.003*only + 0.003*state + 0.003*entry + 0.003*should',\n u'0.004*get + 0.004*them + 0.004*been + 0.004*then + 0.004*good + 0.003*nntppostinghost + 0.003*think + 0.003*gun + 0.003*now + 0.003*his',\n u'0.011*2 + 0.010*0 + 0.009*4 + 0.008*db + 0.007*3 + 0.006*5 + 0.006*7 + 0.006*6 + 0.005*8 + 0.005*10',\n u'0.006*nntppostinghost + 0.005*bike + 0.004*need + 0.004*distribution + 0.004*please + 0.004*email + 0.003*only + 0.003*anyone + 0.003*does + 0.003*time',\n u'0.050*maxaxaxaxaxaxaxaxaxaxaxaxaxaxax + 0.009*turkish + 0.008*armenian + 0.007*armenians + 0.004*armenia + 0.004*turks + 0.004*them + 0.004*been + 0.003*turkey + 0.003*serdar',\n u'0.005*than + 0.005*key + 0.003*nntppostinghost + 0.003*only + 0.003*much + 0.003*such + 0.003*should + 0.003*most + 0.003*get + 0.003*could',\n u'0.008*key + 0.008*chip + 0.007*clipper + 0.006*encryption + 0.004*system + 0.004*keys + 0.004*use + 0.004*government + 0.004*technology + 0.004*window',\n u'0.007*2 + 0.006*game + 0.004*team + 0.004*3 + 0.004*his + 0.004*nntppostinghost + 0.004*go + 0.003*hockey + 0.003*4 + 0.003*think',\n u'0.004*science + 0.004*jesus + 0.004*think + 0.004*god + 0.003*nntppostinghost + 0.003*law + 0.003*does + 0.003*only + 0.003*these + 0.003*should',\n u'0.020*0 + 0.010*2 + 0.006*3 + 0.004*new + 0.004*gordon + 0.004*banks + 0.004*4 + 0.004*time + 0.003*get + 0.003*period',\n u'0.006*windows + 0.005*use + 0.005*drive + 0.004*get + 0.004*nntppostinghost + 0.004*file + 0.004*card + 0.004*system + 0.004*also + 0.003*thanks',\n u'0.004*nntppostinghost + 0.004*am + 0.003*them + 0.003*only + 0.003*his + 0.003*mouse + 0.003*then + 0.003*im + 0.003*does + 0.003*get',\n u'0.006*use + 0.005*nntppostinghost + 0.004*widget + 0.004*window + 0.004*get + 0.003*problem + 0.003*then + 0.003*motif + 0.003*also + 0.003*am',\n u'0.010*israel + 0.008*israeli + 0.004*nntppostinghost + 0.004*only + 0.003*state + 0.003*arab + 
0.003*jews + 0.003*think + 0.003*new + 0.003*peace',\n u'0.006*his + 0.005*think + 0.004*been + 0.004*god + 0.004*them + 0.004*why + 0.003*than + 0.003*only + 0.003*because + 0.003*us',\n u'0.010*space + 0.006*information + 0.005*available + 0.004*data + 0.004*also + 0.004*m + 0.003*may + 0.003*nasa + 0.003*system + 0.003*program',\n u'0.006*his + 0.006*team + 0.006*year + 0.005*than + 0.005*players + 0.005*good + 0.004*think + 0.003*better + 0.003*last + 0.003*games',\n u'0.004*nntppostinghost + 0.004*than + 0.003*use + 0.003*car + 0.003*does + 0.003*could + 0.003*distribution + 0.003*get + 0.003*them + 0.003*need',\n u'0.004*nntppostinghost + 0.003*get + 0.003*new + 0.003*use + 0.003*them + 0.003*center + 0.003*than + 0.003*does + 0.003*also + 0.003*only',\n u'0.009*god + 0.006*them + 0.005*her + 0.005*she + 0.005*his + 0.005*our + 0.005*us + 0.004*then + 0.004*because + 0.004*him']"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def topic_to_likelihoods(word_count, topic):\n    \"\"\"Compare a topic's word probabilities with corpus-wide word frequencies.\n\n    Parameters\n    ----------\n    word_count : dict\n        Maps each word to its number of occurrences in the corpus.\n    topic : iterable of (prob, word) pairs\n        Word probabilities of one topic; the probability comes first in each pair.\n\n    Returns\n    -------\n    topic_distribution : dict\n        word -> probability of the word under the topic.\n    word_distribution : dict\n        word -> share of all occurrences in word_count that belong to the word.\n    word_likelihoods : dict\n        word -> topic probability divided by corpus share, for every topic\n        word whose corpus share is non-zero.\n    \"\"\"\n    word_totals = sum(word_count.values())\n    word_distribution = {k: float(v) / word_totals for k, v in word_count.items()}\n    topic_distribution = {word: prob for prob, word in topic}\n    # Filter on word_distribution: the original filter tested topic_distribution,\n    # which is always true, so a topic word absent from word_count raised KeyError.\n    # Words with a zero corpus share are skipped too, avoiding ZeroDivisionError.\n    word_likelihoods = {\n        word: prob / word_distribution[word]\n        for word, prob in topic_distribution.items()\n        if word_distribution.get(word)\n    }\n    return topic_distribution, word_distribution, word_likelihoods",
"prompt_number": 174,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "TOPIC_ID = 4\nTOP_N = 100\n\n\ndef top_entries(mapping, limit):\n    \"\"\"Return the `limit` largest (value, key) pairs of `mapping`, largest first.\"\"\"\n    return sorted([(value, key) for key, value in mapping.items()], reverse=True)[:limit]\n\n\n# NOTE(review): topn=-1 appears intended to fetch (nearly) the whole vocabulary - confirm against the installed gensim.\ntop = lda.show_topic(TOPIC_ID, topn=-1)\ntopic_dist, word_dist, likelihoods = topic_to_likelihoods(token_count, top)\nprint(top_entries(likelihoods, TOP_N))\nprint(top_entries(topic_dist, TOP_N))",
"prompt_number": 175,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[(35.881762699360699, u'kubilay'), (35.766266392620423, u'rcollinsnsencorecom'), (35.710401445190001, u'depositors'), (35.651081906924894, u'oba'), (35.600863337157229, u'uunetiscbrthorsteveh'), (35.516865674786857, u'reparations'), (35.511159187026742, u'waldocybernetcsefauedu'), (35.453306660813176, u'thrush'), (35.392504720475642, u'0455'), (35.37084637883077, u'8388826'), (35.354377664200335, u'heghinian'), (35.327326511125555, u'appoint'), (35.282840994667453, u'sy'), (35.252088474323131, u'hofferbert'), (35.160744307367409, u'nonturkish'), (35.049554991671776, u'0182'), (35.005586529731374, u'gilkey'), (34.957099140688165, u'abus'), (34.858241872663669, u'hovig'), (34.792935890781898, u'agdam'), (34.773959473241099, u'bullst'), (34.772214537144464, u'khojaly'), (34.560920613320043, u'thoriscbrcom'), (34.369315786970795, u'mitsotakis'), (34.327146847567072, u'selim'), (34.191435496416972, u'jsh'), (34.1248905746058, u'fouryear'), (34.107445149933881, u'lankford'), (34.075418224664659, u'turgut'), (34.044178218493784, u'joesbarccvtedu'), (33.980548565125858, u'pedersen'), (33.937529407121971, u'whiten'), (33.8709252184589, u'yhvh'), (33.866381608423112, u'auschwitz'), (33.862979024246492, u'nagornokarabagh'), (33.857903939127979, u'peaceloving'), (33.830969745882207, u'hoviguxacsouiucedu'), (33.770994066893465, u'timucin'), (33.606466283501895, u'vedat'), (33.55594052002531, u'argics'), (33.529934104619315, u'evacuation'), (33.426142908867284, u'polymorph'), (33.41557419293374, u'syseditexe'), (33.315451461954027, u'utopian'), (33.314686015849354, u'afdc'), (33.288350164324989, u'0214'), (33.286121131554005, u'republication'), (33.278624882644628, u'admiraljhunixhcfjhuedu'), (33.247866838682747, u'bangles'), (33.224532522629353, u'1412148638'), (33.223311769871266, u'0310'), (33.139116575903984, u'0545'), (33.098927364563657, u'bream'), (33.068831720132138, u'dogan'), (33.06673238131021, u'djul'), (32.943484443779845, u'maddux'), (32.902944585539359, 
u'kupajava'), (32.901949604863951, u'vd8cunixbcccolumbiaedu'), (32.900896215144407, u'stevehthoriscbrcom'), (32.855823933612342, u'bigs'), (32.843901008133827, u'mr47andrewcmuedu'), (32.810864712619917, u'krakatoa'), (32.656125184067541, u'karabag'), (32.652309390852423, u'fatherland'), (32.612921321827315, u'frungy'), (32.596888897626798, u'0438'), (32.596056659266878, u'marxism'), (32.577908840586311, u'lopez'), (32.548374579823623, u'barbers'), (32.533896915515086, u'devineni'), (32.487172206905313, u'hendricks'), (32.432663586229005, u'herod'), (32.393872728068928, u'eichmann'), (32.357935961027522, u'mamatha'), (32.326708527651078, u'gt1091a'), (32.22000133251376, u'0417'), (32.098859226656138, u'catchers'), (31.979571056402715, u'commemorative'), (31.904269114197216, u'deportation'), (31.883074496350776, u'3392'), (31.749031668457175, u'vay'), (31.742933749359864, u'azeri'), (31.62842815841881, u'resorted'), (31.557306821165259, u'libertarians'), (31.521663461477313, u'pagnozzi'), (31.514556164397089, u'asalasdpaarf'), (31.263420154231284, u'itartass'), (31.238577349689059, u'no2'), (31.211769768510614, u'0353'), (31.157039897578695, u'capitalists'), (31.053882209428426, u'sadikov'), (30.942397292001811, u'yalanci'), (30.82713489786072, u'profs'), (30.821451137527287, u'i2c'), (30.820666518591548, u'afs'), (30.779443308906835, u'0304'), (30.715397660340106, u'villanueva'), (30.652321999968009, u'mrdavidian'), (30.609586796058316, u'wehrmacht'), (30.497864453023979, u'malefemale')]\n[(0.0081985657914655334, u'his'), (0.0061157485211848555, u'turkish'), (0.0048620457873832, u'armenian'), (0.0043217117264888541, u'government'), (0.0042182813417435926, u'armenians'), (0.0032509947902106848, u'them'), (0.0030422995938456619, u'think'), (0.003021356622458725, u'said'), (0.0029006193872216516, u'even'), (0.0028445666035166873, u'then'), (0.0028278891456574467, u'armenia'), (0.0027316177364712546, u'jews'), (0.0027097148141743692, u'him'), (0.0026384345295893284, 
u'first'), (0.002596845933175748, u'now'), (0.002583862503745993, u'well'), (0.0025192990117336796, u'year'), (0.0024734329640932742, u'turks'), (0.0024319243783666091, u'only'), (0.002420499814155052, u'time'), (0.0024134296180960673, u'good'), (0.0023095791476433441, u'our'), (0.0022797841872107226, u'also'), (0.0022411436141505326, u'last'), (0.0022386950496894901, u'world'), (0.0022219633325529955, u'genocide'), (0.0021744122201299287, u'could'), (0.002157495665917717, u'serdar'), (0.0021324412858504671, u'such'), (0.00208541473573814, u'argic'), (0.0020785854818008444, u'david'), (0.0020743099079407255, u'because'), (0.0020520559040526276, u'years'), (0.0020212540039353261, u'after'), (0.0020147164698575519, u'did'), (0.0020126883796364866, u'over'), (0.0019803592860442145, u'two'), (0.0019648703392403607, u'those'), (0.001958866719279351, u'same'), (0.001952312259138113, u'way'), (0.0018868362126230999, u'being'), (0.0018690350762697141, u'see'), (0.0018427076489938158, u'turkey'), (0.0018050769338437526, u'nntppostinghost'), (0.001793969420086333, u'still'), (0.0017886050394469314, u'back'), (0.0017613072225039434, u'against'), (0.001754114370770587, u'down'), (0.001723435618521114, u'much'), (0.001648759780405396, u'go'), (0.0016316602338800855, u'might'), (0.0015760929346156347, u'say'), (0.001560053266019919, u'before'), (0.0015554093477963208, u'new'), (0.0015091410165327939, u'today'), (0.0014815833786984919, u'these'), (0.0014547299360164131, u'again'), (0.0014523902922549763, u'didnt'), (0.0014521102186808451, u'killed'), (0.0014504477945279375, u'work'), (0.0014442424479523385, u'cant'), (0.0014379154545509614, u'book'), (0.0014375373583734642, u'women'), (0.0014340592532848277, u'never'), (0.0013820895061681182, u'during'), (0.0013488076049376807, u'muslim'), (0.001319137733269158, u'right'), (0.0012969392332219647, u'greek'), (0.0012766184820891422, u'war'), (0.0012729874407731727, u'really'), (0.0012713904067018801, u'million'), 
(0.0012712410220209463, u'replyto'), (0.0012698384834601199, u'while'), (0.0012696574641748132, u'own'), (0.0012615007842411293, u'made'), (0.001257511768210011, u'going'), (0.001246672520942547, u'want'), (0.0012345102275016569, u'another'), (0.0012192339543301697, u'here'), (0.0011993656728785721, u'take'), (0.0011905242666798018, u'serazumauucp'), (0.0011898921106035802, u'away'), (0.001189693348834965, u'better'), (0.001188809286142559, u'through'), (0.0011863104687060335, u'fact'), (0.0011821451904470393, u'steve'), (0.0011675596647505934, u'history'), (0.0011623943383006689, u'children'), (0.0011570581084318504, u'look'), (0.0011568305933733287, u'thing'), (0.0011397685649776324, u'every'), (0.001138205044051612, u'may'), (0.0011308882461661639, u'nazi'), (0.001120846617745248, u'soviet'), (0.0011191648944761921, u'sure'), (0.0011074444019334654, u'mr'), (0.001103145831546942, u'men'), (0.0011014421182270987, u'distribution'), (0.0010951615347741376, u'since'), (0.0010929837145707364, u'around')]"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# The 300 most frequent tokens as (count, token) pairs, most frequent first.\ncount_token_pairs = [(count, tok) for tok, count in token_count.items()]\ncount_token_pairs.sort(reverse=True)\ncount_token_pairs[:300]",
"prompt_number": 145,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 145,
"metadata": {},
"text": "[(146332, 'the'),\n (72647, 'to'),\n (68883, 'of'),\n (60943, 'a'),\n (57452, 'and'),\n (48662, 'in'),\n (43278, 'is'),\n (41559, 'i'),\n (37736, 'that'),\n (29529, 'it'),\n (28534, 'for'),\n (24609, 'you'),\n (22551, 'from'),\n (20096, 'on'),\n (20073, 'this'),\n (19176, 'be'),\n (18599, 'are'),\n (18382, 'not'),\n (18048, 'have'),\n (17301, 'with'),\n (15468, 'as'),\n (14364, 'or'),\n (13593, 'was'),\n (13562, 'if'),\n (13308, 'but'),\n (12253, 'subject'),\n (12210, 'they'),\n (11824, 'lines'),\n (11185, 'organization'),\n (11056, 'by'),\n (10953, 'at'),\n (10011, 'an'),\n (9615, 'my'),\n (9603, 'can'),\n (9327, 'what'),\n (9241, 'will'),\n (9132, 'all'),\n (8903, 'there'),\n (8874, 'would'),\n (8654, 'one'),\n (8625, 'do'),\n (8330, 'about'),\n (8132, 'we'),\n (8023, 're'),\n (7836, 'writes'),\n (7606, 'your'),\n (7577, 'he'),\n (7512, 'has'),\n (7508, 'so'),\n (7221, 'no'),\n (7017, 'any'),\n (6754, 'article'),\n (6698, 'me'),\n (6622, 'some'),\n (6371, 'who'),\n (6179, 'its'),\n (6062, 'out'),\n (6059, 'which'),\n (5935, 'were'),\n (5857, 'more'),\n (5833, 'people'),\n (5814, 'dont'),\n (5760, 'like'),\n (5684, 'when'),\n (5589, 'just'),\n (5554, '1'),\n (5544, 'university'),\n (5468, 'their'),\n (5457, 'x'),\n (5202, 'how'),\n (5153, 'other'),\n (5116, 'know'),\n (5073, 'up'),\n (4975, 'only'),\n (4943, 'get'),\n (4797, 'them'),\n (4781, 'nntppostinghost'),\n (4695, 'than'),\n (4630, 'had'),\n (4522, 'been'),\n (4503, 'think'),\n (4444, 'his'),\n (4296, 'also'),\n (4168, '2'),\n (4155, 'use'),\n (4125, 'does'),\n (3944, 'then'),\n (3922, 'new'),\n (3865, 'time'),\n (3727, 'im'),\n (3719, 'good'),\n (3675, 'these'),\n (3622, 'should'),\n (3495, 'could'),\n (3460, 'well'),\n (3406, 'us'),\n (3336, 'because'),\n (3307, 'maxaxaxaxaxaxaxaxaxaxaxaxaxaxax'),\n (3302, 'am'),\n (3271, 'may'),\n (3259, 'even'),\n (3229, 'why'),\n (3225, 'very'),\n (3179, 'now'),\n (3078, 'into'),\n (3034, 'see'),\n (2934, 'many'),\n (2927, 'way'),\n (2894, 'first'),\n (2892, 
'those'),\n (2881, 'make'),\n (2874, 'two'),\n (2863, 'much'),\n (2848, 'most'),\n (2780, 'such'),\n (2763, 'distribution'),\n (2702, 'system'),\n (2698, 'say'),\n (2680, 'right'),\n (2672, 'where'),\n (2598, '3'),\n (2557, 'god'),\n (2515, 'want'),\n (2490, 'our'),\n (2459, 'here'),\n (2446, 'said'),\n (2444, 'being'),\n (2442, 'world'),\n (2438, '0'),\n (2434, 'used'),\n (2428, 'did'),\n (2428, 'anyone'),\n (2411, 'same'),\n (2410, 'after'),\n (2406, 'need'),\n (2403, 'go'),\n (2371, 'over'),\n (2342, 'work'),\n (2244, 'too'),\n (2237, '4'),\n (2223, 'something'),\n (2207, 'problem'),\n (2174, 'really'),\n (2143, 'please'),\n (2108, 'him'),\n (2088, 'computer'),\n (2069, 'since'),\n (2022, 'still'),\n (2021, 'believe'),\n (2001, 'going'),\n (2000, '5'),\n (1984, 'years'),\n (1982, 'email'),\n (1959, 'back'),\n (1958, 'information'),\n (1940, 'file'),\n (1900, 'help'),\n (1895, 'using'),\n (1888, 'off'),\n (1881, 'find'),\n (1862, 'take'),\n (1854, 'question'),\n (1841, 'thanks'),\n (1839, 'before'),\n (1833, 'state'),\n (1833, 'must'),\n (1814, 'ive'),\n (1803, 'last'),\n (1801, 'point'),\n (1797, 'usa'),\n (1791, 'things'),\n (1783, 'while'),\n (1782, 'better'),\n (1757, 'cant'),\n (1754, 'might'),\n (1742, 'replyto'),\n (1736, 'own'),\n (1733, 'both'),\n (1729, 'never'),\n (1717, 'government'),\n (1714, 'year'),\n (1698, 'number'),\n (1694, 'another'),\n (1693, 'sure'),\n (1673, 'without'),\n (1661, '10'),\n (1653, 'program'),\n (1648, 'read'),\n (1642, 'windows'),\n (1624, 'got'),\n (1621, 'through'),\n (1616, 'space'),\n (1602, 'case'),\n (1589, 'down'),\n (1578, 'made'),\n (1575, 'data'),\n (1573, 'etc'),\n (1567, 'available'),\n (1566, 'drive'),\n (1553, 'c'),\n (1531, 'doesnt'),\n (1530, 'under'),\n (1528, 'david'),\n (1519, 'thing'),\n (1516, 'someone'),\n (1498, 'look'),\n (1487, 'between'),\n (1485, 'thats'),\n (1482, '6'),\n (1475, 'part'),\n (1471, 'few'),\n (1465, 'little'),\n (1456, 'come'),\n (1453, 'didnt'),\n (1445, 'power'),\n (1440, 'law'),\n 
(1433, '8'),\n (1430, 'version'),\n (1424, 'however'),\n (1421, 'each'),\n (1409, 'long'),\n (1403, 'software'),\n (1396, 'anything'),\n (1392, '7'),\n (1392, '11'),\n (1391, 'around'),\n (1385, 'fact'),\n (1380, 'science'),\n (1379, 'give'),\n (1375, 'every'),\n (1374, 'probably'),\n (1362, 'best'),\n (1355, 'true'),\n (1347, 'key'),\n (1347, 'again'),\n (1336, 'id'),\n (1335, 'john'),\n (1321, 'day'),\n (1306, 'public'),\n (1305, 'least'),\n (1305, 'course'),\n (1301, 'against'),\n (1299, 'tell'),\n (1299, 'seems'),\n (1297, '20'),\n (1291, 'different'),\n (1281, 'systems'),\n (1279, 'set'),\n (1274, 'group'),\n (1263, 'great'),\n (1263, 'enough'),\n (1262, 'put'),\n (1255, 'car'),\n (1245, 'try'),\n (1241, '15'),\n (1239, '12'),\n (1238, 'name'),\n (1237, 'lot'),\n (1225, 'says'),\n (1222, 'research'),\n (1219, '1993'),\n (1207, 'list'),\n (1201, 'run'),\n (1198, 'possible'),\n (1195, 'jesus'),\n (1192, 'far'),\n (1192, 'actually'),\n (1178, 'though'),\n (1178, 'either'),\n (1165, 'inc'),\n (1161, 'second'),\n (1161, 'real'),\n (1157, 'support'),\n (1153, 'hard'),\n (1146, 'technology'),\n (1142, 'center'),\n (1140, 'game'),\n (1136, 'rather'),\n (1131, 'card'),\n (1130, 'old'),\n (1129, 'life'),\n (1121, '16'),\n (1120, 'she'),\n (1120, 'nothing'),\n (1112, 'next'),\n (1110, 'mean'),\n (1107, 'team'),\n (1098, 'line'),\n (1095, 'youre'),\n (1093, 'problems'),\n (1090, 'chip'),\n (1089, 'let'),\n (1081, 'end'),\n (1079, 'window'),\n (1069, 'internet'),\n (1068, 'her'),\n (1068, 'call'),\n (1065, 'called'),\n (1063, 'post'),\n (1062, 'keep')]"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"prompt_number": 80,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:aed557e5d625ae6677ed4ef53e6d0e2321c768b19b3f442555c956eaa6825d92"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment