Skip to content

Instantly share code, notes, and snippets.

@mr1azl
Forked from langmore/gist:6820351
Last active August 29, 2015 14:10
Show Gist options
  • Save mr1azl/231d413c7e26eb15709e to your computer and use it in GitHub Desktop.
Save mr1azl/231d413c7e26eb15709e to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "filter_with_meta_ian"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# How to use metadata to do LDA on subsets\n\nAlong the way we introduce two new classes:\n\n1. `streamers.TextFileStreamer` provides ways to stream information (*text*, *doc_id*, *tokens*, etc.) from a source of text files.\n2. `gensim_helpers.SvmLightPlusCorpus` extends `gensim.corpora.SvmLightCorpus`, providing ways to filter the corpus using `doc_id`."
},
{
"cell_type": "markdown",
"metadata": {},
"source": "---\n\n## Set up\n\n---"
},
{
"cell_type": "code",
"collapsed": false,
"input": "import os\n\nimport pandas as pd\nimport matplotlib.pylab as plt\n\nfrom gensim import corpora, models, similarities\nimport gensim\n\nfrom declass.utils import text_processors, filefilter, gensim_helpers,\\\n nlp, topic_seek, common, streamers\n\nfrom datetime import date\nfrom numpy import datetime64",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Every time you change the source code you need to reload it...so I keep this cell handy\nreload(text_processors)\nreload(filefilter)\nreload(gensim_helpers)\nreload(topic_seek)\nreload(streamers)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 40,
"text": "<module 'declass.utils.streamers' from '/home/langmore/lib/declass/declass/utils/streamers.pyc'>"
}
],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Set paths\n\n# I use environment variables to set my base paths\nDATA = os.environ['DATA'] \nME = os.environ['ME'] \nMYDATA = os.path.join(DATA, ME, 'ddrs-01') \nRAW = os.path.join(MYDATA, 'raw') \nPROCESSED = os.path.join(MYDATA, 'processed')\n\n# You only need to set these paths below...any way you want...\nmetafile_path = os.path.join(RAW, 'meta', 'ddrs_meta.csv')\ntext_base_path = os.path.join(RAW, 'ddrs_nofoot') # Read files made with the \"nofoot\" option\ncorpus_path = os.path.join(PROCESSED, 'corpus', 'ddrs-gensim.svmlight')\ndictionary_path = os.path.join(PROCESSED, 'dict', 'ddrs-basic.dict')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Load the `meta`"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Read meta and create some columns\nmeta = pd.read_csv(metafile_path, sep='\\t', quotechar='|', na_values=['None'])\nmeta = meta.rename(columns={name: name.lower() for name in meta.columns})\nmeta = meta.set_index('id')\n\nmeta['written_year'] = meta.written.str[0:4].astype('float')\nmeta['released_year'] = meta.released.str[0:4].astype('float')\nmeta['is_sanitized'] = (meta.sanitation == 'Sanitized').astype('int')\n\nmeta['written'] = pd.to_datetime(meta.written)\nmeta['released'] = pd.to_datetime(meta.released)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Sort by the date document was written\nmeta = meta.sort_index(by='written')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Map each president's name to the (start, end) dates of his administration.\n# Consecutive terms share their boundary date (one term's end is the next one's start).\n# Roosevelt died in office on 1945-04-12, the day Truman was sworn in, so that is the\n# hand-over date -- not 1949-01-20, which is when Truman's second term began.\nadmins = {\n    \"Roosevelt\": (date(1933, 3, 4), date(1945, 4, 12)),\n    \"Truman\": (date(1945, 4, 12), date(1953, 1, 20)),\n    \"Eisenhower\": (date(1953, 1, 20), date(1961, 1, 20)),\n    \"Kennedy\": (date(1961, 1, 20), date(1963, 11, 22)),\n    \"Johnson\": (date(1963, 11, 22), date(1969, 1, 20)),\n    \"Nixon\": (date(1969, 1, 20), date(1974, 8, 9)),\n    \"Ford\": (date(1974, 8, 9), date(1977, 1, 20)),\n    \"Carter\": (date(1977, 1, 20), date(1981, 1, 20)),\n    \"Reagan\": (date(1981, 1, 20), date(1989, 1, 20)),\n    \"H. W. Bush\": (date(1989, 1, 20), date(1993, 1, 20)),\n    \"Clinton\": (date(1993, 1, 20), date(2001, 1, 20)),\n    \"W. Bush\": (date(2001, 1, 20), date(2009, 1, 20)),\n    \"Obama\": (date(2009, 1, 20), date(2017, 1, 20))\n}\n\n# Convert each datetime.date to a pandas Timestamp (via numpy.datetime64)\nfunc = lambda d: pd.Timestamp(datetime64(d))\n\n# .items() (rather than .iteritems()) runs on both Python 2 and Python 3\nadmins = {name: (func(start), func(end)) for name, (start, end) in admins.items()}",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Add fields corresponding to administration to data frames\ndef get_admin(np_datetime, pres_dict):\n    \"\"\"\n    Return the president's name from pres_dict corresponding to np_datetime.\n\n    Parameters\n    ----------\n    np_datetime : value comparable with the bounds stored in pres_dict\n        The date to look up.\n    pres_dict : dict\n        Maps president name -> (start, end) of that administration.\n\n    Returns\n    -------\n    The name whose half-open interval [start, end) contains np_datetime,\n    or 'unknown' when no interval does.\n    \"\"\"\n    for pres, (start, end) in pres_dict.items():\n        # Half-open on purpose: in `admins`, a hand-over date is the end of one\n        # term and the start of the next, so with <= on both sides it matched two\n        # presidents and the winner depended on dict iteration order. It now\n        # always goes to the incoming administration.\n        if start <= np_datetime < end:\n            return pres\n\n    return 'unknown'\n\nmeta['written_admin'] = meta.written.apply(lambda d: get_admin(d, admins))\nmeta['released_admin'] = meta.released.apply(lambda d: get_admin(d, admins))",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Plot the size of data within each administration\nmeta.groupby('written_admin', sort=False).size().plot(kind='barh')",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": "<matplotlib.axes.AxesSubplot at 0x55281d0>"
},
{
"metadata": {},
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAAAbkAAAD5CAYAAAC6e0vwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt8DOf+B/DPirhmBS2pSyskRNhLNiKSkHSJS5W4NrTa\n6FZR11KOo077q+SnLinqdji/HkVU9VRKFS3qliWSuOSyCeIuQUOIBLkIkezz+yPNnERuG53JzKzv\n+/WalzyzM7OffbBP5nlm5lEwxhgIIYQQK1RH7ACEEEKIUKiRI4QQYrWokSOEEGK1qJEjhBBitaiR\nI4QQYrWokSOEEGK16oodwNooFAqxIxBCiCwJcUcbnckJgDEm+WX+/PmiZ7CGjJSTckp9kUtOoVAj\n94JKTU0VO0K15JARoJx8o5z8kktOoVAjRwghxGpRI/eCMhgMYkeolhwyApSTb5STX3LJKRQFE7Iz\n9AWkUCgE7V8mhBBrJNR3J53JvaCMRqPYEaolh4wA5eQb5eSXXHIKRRaN3CeffIJVq1Zx5QEDBmDC\nhAlcefbs2VixYkWl+zs6OkKj0UCn00Gj0WD37t3PlSM4OBjLly9/rn0JIYTUPlk0cr169UJ0dDQA\nwGw2IzMzE8nJydzrMTEx6NmzZ6X7KxQKGI1GJCQkYPv27fj444+fK4c13QOn1+vFjlAtOWQEKCff\nKCe/5JJTKLJo5Ly9vRETEwMAOHfuHFQqFZRKJR48eIAnT57g/PnzcHd3r/IYJX29Dx8+RPPmzQEU\nX1qrVqu5bZYtW4aQkBAAwOrVq9G1a1dotVqMGTOG2yY5ORm9e/eGk5MT1qxZw+vnJIQQwi9ZPPGk\ndevWqFu3Lm7evImYmBh4e3sjLS0NMTExaNKkCdRqNerWrfyjMMbQu3dvMMZw7do1/PTTTxVup1Ao\nuLO10NBQpKamwtbWFtnZ2dxxLly4AKPRiOzsbLi4uGDKlCmwsbEpcxyDwQBHR0cAQNOmTeHm5sb9\nNlXSPy52uWSdVPJUVH42q9h5KiubTCbMnDlTMnkqK1N9Un1KIU9J2Wg0IiwsDAC470shyObqyvfe\new8BAQHYt28fZs2ahbS0NERHR8Pe3h5ZWVlYtGhRpfu2b98ecXFxaN68Oa5duwZ/f3+cO3cOd+/e\nRUBAAM6cOQMAWL58OXJzczF//nwMHDgQdnZ2GDZsGIYNG4bGjRsjJCQE9erVw7x58wAAXbp0waFD\nh9C6dWvuveRydaXRaOT+4UmVHDIClJNvlJNfcsn5wl9d2bNnT0RFReHMmTNQq9Xw8vJCdHQ0oqOj\n4ePjY/FxOnToAAcHByQnJ6Nu3bowm83ca/n5+dzPv/32G6ZOnYr4+Hh0794dRUVFAIB69epx29jY\n2KCwsJCHT1f75PCPXg4ZAcrJN8rJL7nkFIpsGjkfHx/8+uuveOmll6BQKNCsWTM8ePAAMTExFjVy\nJb8h3L17FykpKWjXrh0cHBxw9+5dZGVl4cmTJ/j111+53yZu3LgBvV6PJUuW4OHDh8jNzZXFGRoh\nhJD/kk0jp1KpkJmZCS8vL26dRqNB06ZN0bx5c9y6dQuDBg2qdP/evXtDp9OhT58+CA0NRYsWLWBr\na4svvvgCnp6e6N+/P7p06QIAKCoqQlBQEDQaDdzd3TFjxgzY29uXGbOTu9LjCVIlh4wA5eQb5eSX\nXHIKRRYXngDFXYMPHz4ss27Tpk3cz61bt8Zvv/1W4b4pKSmVHnf69OmYPn16ufWRkZHl1s2fP79M\nuWQsjxBCiDTJ5sITuZDLhSeEECIlL/yFJ4QQQkhNUSP3gpJDP70cMgKUk2+Uk19yySkUauQIIYRY\nLRqT4xmNyRFCSM3RmJyMNGnSXOwIhBBCQI2cIHJy7osdoVpy6KeXQ0aAcvKNcvJLLjmFQo0cIYQQ\nqyXrMbn09HTMnDkTsbGxaNq0KRwcHLBy5UqMGDECZ8
6cQWxsLLZs2VJmwtVnPXz4ED/88AMmT57M\nS6aSJ6LIuFoJIaTWCTUmJ9tGjjEGHx8ffPDBB5g4cSKA4ieQPHz4EJMnT7b4aSSpqallZiL4q6iR\nI4SQmqMLT54RERGBevXqcQ0cAKjVarRt25YrG41GBAQEAACCg4Mxbty4chOefvrpp7h69Sp0Oh3m\nzp0LAJgzZw7UajU0Gg3Cw8O5Y+n1egQGBsLV1RXvvfdebX1UQcihn14OGQHKyTfKyS+55BSKbJ5d\n+ayzZ8+iW7duNdrn0qVLiIiIKDPhaWhoKM6dO4eEhAQAwI4dO5CYmIikpCRkZGSge/fu8PPzAwCY\nTCYkJyejVatW3NQ/PXv2rPC9goODAUh70lQp5ZFz2WQySSqP3MtUny9GfRpp0tSqrVmzBikpKfj6\n66/LrC/d/Wg0GrF8+XLs2bOn0glPCwoKynRXzpo1CxqNBgaDAQAwduxYBAYGokmTJli4cCEOHDgA\nAJgyZQp69uyJd999t8z7U3clIYTUHHVXPqNr166Ii4ur0T6WTnj6bEWXNFz169e3aH9CCCHSINtG\nrk+fPnjy5AnWr1/PrUtKSsLNmzcr3L6y3xCUSiVycnK4sq+vL7Zt2waz2YyMjAwcO3YMnp6eVndm\nVtJtIGVyyAhQTr5RTn7JJadQZNvIAcDOnTtx6NAhODs7Q6VS4bPPPkOrVq3KTGxa8nNlE56+9NJL\n6NmzJ9RqNebOnYvhw4dDo9FAq9XC398fS5cuRcuWLSvc31omUCWEEGsl2zE5qaIxOUIIqTkak5MR\npbKZ2BEIIYSAGjlBZGdniR2hWnLop5dDRoBy8o1y8ksuOYVCjRwhhBCrRWNyPKP55AghpOZoTI4Q\nQgipIWrkBFByu0HpRWoTqcqhn14OGQHKyTfKyS+55BSKbJ9dKW3lT7lzcuieOkIIqW21fiZnZ2dX\nphwWFobp06dXuY9Op0NiYiIAoLCwEHZ2dti6dSv3erdu3WAymSrcNzU1FQ0bNoROp4Obmxt69uyJ\nS5cuPVd2vV5f40eJSVXJA1OlTA4ZAcrJN8rJL7nkFEqtN3LP89SQXr16ITo6GgCQmJgIFxcXrpyX\nl4dr165Bq9VWur+zszMSEhJgMpnw/vvvY9GiRbxkJ4QQIm2ij8lZcjWNj48P16jFxMRg0qRJ3Jnb\nqVOn0K1bN4sboIcPH6J58+LxsWfPIgcPHoyjR4/CbDbDYDBwc8qVnln8p59+Qo8ePeDi4oLjx49b\n/DmlRg799HLICFBOvlFOfsklp1BqfUwuPz8fOp2OK2dlZWHo0KFV7uPj44PPP/8cABAdHY358+fj\nP//5D3JzcxEdHV3pnG4lSiZFzcnJwaNHj3Dq1CkAFZ9VKhQKJCQk4NatW9z0O9nZ2dw2RUVFOHny\nJPbt24eQkBAcPHjQ8g9PCCGkVtV6I9ewYUNuglIA2Lx5M2JjY6vcp127digoKMCdO3dw4cIFuLi4\noHv37jh58iRiYmLw8ccfV7m/k5MT957h4eGYMGEC9u3bV+lZpJOTE65du4aPP/4YgwYNQv/+/bnX\nRowYAQBwd3dHampqJe9oAOD4589NAbhxr0hp0kKpl0smVpRKnqrKJaSSh+pT+DLVpzwmTQWrZXZ2\ndmXKmzZtYtOmTat2v1GjRrHVq1ezN998kzHG2M8//8yCg4NZixYtWHZ2dqX7paSkMJVKxZUfPXrE\nGjVqxBhjbMuWLWzKlCnca3379mVHjx5ljDGWm5vLduzYwYYNG8bGjRvHGGNMr9ezuLg4xhhjGRkZ\nzNHRsdz7AWAAq2Cp9aomhBDZEOo7UvQxOUv5+Phg5cqV8PHxAQB4e3vju+++Q6tWraBUKi0+zvHj\nx+Hs7Ayg+LcHk8kExhhu3rzJdWNmZmaiqKgII0aMwIIFC8qceVqLZ3/DkyI5ZAQoJ98oJ7/kklMo\ntd5dWdk4GAB888
03AICPPvqo3H4+Pj6YNWsWvL29AQCvvPIKzGYz1+jt2bMHsbGxCAkJKbdvyZgc\nYwz169fHt99+C6D4qs327dujS5cucHV1Rbdu3QAAaWlp+OCDD2A2mwEAS5YsseizEEIIkRZ6diXP\nihu+iqqUnmlJCCGVoWdXEkIIITVEjZwgFOUWqU2kKod+ejlkBCgn3ygnv+SSUyj07EoBULckIYRI\nA43J8YzmkyOEkJqjMTlCCCGkhqiRE0BF88lJeZHaXHcl5DKWQDn5RTn5JZecQqFGThBMBksE93NO\nzn2B6oEQQsQluzE5GxsbaDQaFBUVwdnZGd999125OerEVPl9clJG44iEEHHRmNyfGjVqhISEBCQl\nJaFJkybcU1IIIYSQZ8mukSvN29sbV69eBVD86K6BAwfCw8MDfn5+uHjxIoDix315eXnB3d0d/fr1\nw927dwEAGRkZ6NevH1QqFSZMmABHR0dkZWUBAIYPHw4PDw+oVCqsX7+eez87Ozt8/vnncHNzg7e3\nN3cseTKKHaBachlLoJz8opz8kktOoci2kSsqKsKBAwegUqkAABMnTsSaNWsQGxuLpUuXYsqUKQAA\nX19fnDhxAvHx8Rg9ejS++uorAEBISAj69u2Ls2fP4q233sKNGze4Y2/cuBGxsbE4ffo0Vq9ejfv3\ni8esHj16BG9vb5hMJvj5+ZVpAAkhhEiP7Mbk6tatC7VajbS0NDg6OuLEiRN49OgRWrZsCRcXF267\ngoICnDt3DmfOnMHs2bORnp6OgoICdOjQAXv37oVOp8Mvv/yCdu3aAQBeeuklXL58Gc2bN0dwcDB+\n+eUXAEBqaioOHDgAT09PNGjQAI8fPwZQPC/dwYMHyzV0NCZHCCE1J9SYnOyeeFIy6Wp+fj4GDBiA\nXbt2oW/fvmjatGmFU+JMnz4df/vb3zB48GAcPXoUwcHB3GsVVajRaMThw4dx4sQJNGjQAL179+Ya\nNltbW267OnXqoLCwsJKUBpSfNFVf8g5//im18p8lCU2qSGUqU9l6y0ZrnTT1ryo96WpCQgJzdXVl\nZrOZ+fj4sJ9++okxxpjZbGaJiYmMMcZ0Oh030anBYGB6vZ4xxtjUqVNZaGgoY4yx33//nSkUCpaZ\nmcl27drFAgICGGOMnT9/njVo0ICbSLX0e//000/MYDCUy4dKJ02V2hIh+QldIyIixI5gEcrJL8rJ\nL7nkFOp7SHZjcqXncHNzc4OzszPCw8OxdetWbNiwAW5ublCpVNi9ezcAIDg4GIGBgfDw8ECLFi24\n/efPn48DBw5ArVZj+/bteOWVV6BUKvHGG2+gsLAQXbp0wbx587j5655979Lz4BFCCJEm2Y3J8aWg\noAA2NjawsbFBTEwMpk6divj4+L98XBqTI4SQmqMxOZ7duHEDo0aNgtlsRr169ehKSUIIsUKy667k\ni7OzM+Lj42EymXDq1Cl069ZN7Ei1zCh2gGqVDFJLHeXkF+Xkl1xyCuWFPZMTlrzG6qQ2oSshhPDl\nhR2TEwrNJ0cIITVHz64khBBCaogauReUHPrp5ZARoJx8o5z8kktOodCYnADo/rmylMpmyM7OEjsG\nIeQFRGNyPJPnfXJCo3FKQkjVaEyOEEIIqSGraeTS09Px9ttvw9nZGR4eHhg0aBAuX75s8f6LFi0S\nMJ0UGcUOUC25jCVQTn5RTn7JJadQrKKRY4xh+PDh6NOnD65cuYLY2FgsXrwYd+7csWh/s9mMxYsX\n1/h9zWZzjfchhBBSe6xiTO7IkSMICQnB0aNHy6zPy8vD0KFDcf/+fTx9+hRffvklhgwZgtTUVAwY\nMABeXl6Ii4uDp6cnvvvuO6jVaqhUKmzZsgXff/891qxZg4KCAvTo0QPr1q1DnTp1YGdnh0mTJuHQ\noUNYt24dfHx8yrwnjclVhMbkCCFVE2pMzioaudWrVyM1NRVff/11mfVFRUV49OgR
lEol7t27B29v\nb1y+fBmpqalwcnJCTEwMPD09AQBKpRI5OTkAgPPnz2Pu3LnYuXMnbGxsMGXKFHh7eyMoKAh16tRB\neHg43nrrrQqzUCNXEWrkCCFVE/UBzcePH0dISAhSU1O5iUIVCgWuXbvGe6DnUdkl+2azGfPmzUNk\nZCTq1KmDW7du4e7duwCAdu3acQ3csw4fPoy4uDh4eHgAAPLz8/HKK68AAGxsbDBy5MhqEhkg/UlT\nS9bV5vvVfFJFLqkEJnmsrGwymTBz5kzJ5KmsTPVJ9SmFPCVlo5QmTe3UqRPbu3cvS09PZxkZGdwi\nFYcPH2Z+fn7l1m/atImNHj2aFRYWMsYYc3R0ZNevX2cpKSlMpVKV2bb0hKhr1qxh8+bNq/C9Sm9X\nEchy0lShl+ebDFEukz1STn5RTn7JJefzfk9Ux6ILT5o2bYqBAwfCwcEBL7/8MrdIRZ8+ffDkyZMy\n0+UkJSXhxo0baNmyJWxsbBAREYHr169XegxbW1vuLNXf3x/bt29HRkYGACArKws3btwQ9kPUOr3Y\nAapV8tuf1FFOflFOfsklp1AsauR69+6NOXPmICYmBvHx8dwiJTt37sShQ4fg7OwMlUqFzz77DG++\n+SZiY2Oh0WiwZcsWuLq6cts/28U5ceJEaDQaBAUFwdXVFV9++SX69+8PrVaL/v37Iz09vcL9CCGE\nSJdFF57o9foKv9wjIiIECSVn8rnwxIjaO5t7vgFlo9Eoi99CKSe/KCe/5JJT1AtPSg+wEkIIIXJR\n5Zncli1bEBQUhOXLl5c5k2OMQaFQYNasWbUSUk6oO7M8ekAzIaQ6opzJPXr0CACQk5NTYSNHKibE\nXxQhhJCas4qbwaVELjODy6GfXg4ZAcrJN8rJL7nkFHVM7tq1a1izZk25m8F3797NeyBCCCGELxad\nyWk0GowfPx4qlQp16hTfdaBQKPD6668LHlBuxOrGpXEvQoicifrsSk9PT5w6dYr3N7dG4t1CII9u\nUkIIqYiok6ZOnz4dwcHBkr4ZnNSMHG4LkUNGgHLyjXLySy45hWLRmNy5c+ewZcsWREREcN2VgDxv\nBrexsYFGo+HKu3btwmuvvVbj46SmpiIgIABnzpzhMx4hhBAeWdRd6eTkhPPnz6NevXq1kUlQpafU\nqYmioiLY2Nhw5coaOequJISQmhP16kq1Wo379+/DwcGB9wBSYDKZMGnSJOTn58PJyQkbN25E06ZN\nodfrodPpcPz4cYwZMwZ+fn4YN24cFAoF+vfvL3ZsQggh1bCokbt//z46d+6M7t27o379+gDkewtB\nfn4+dDodAKBDhw7YsWMHxo4di7Vr18LX1xfz589HSEgIVqxYAYVCgadPn+L06dMAiq8yXbduHXr1\n6oW///3vVbyLAbU/n9yfJQvncypZJ6X5pZ4t03xdVJ9SyFNZmerTiuaTi4iIqHCRo2fng3vw4AF7\n7bXXuPLVq1eZu7s7Y4wxvV7Pjh07xhhj7P79+2W2S0pKKjcnHWNizidXs7mY5PD3J4eMjFFOvlFO\nfsklZ02/wyxl0ZlcSSv8ImDP9Ak3btzYou3kRg5/p3LICFBOvlFOfsklp1CqvIXAzs4OSqWywqVJ\nkya1lVFQ9vb2aNasGY4fPw6g+KHUpf9RlDRmTZs2RdOmTREVFQUA2Lp1a61nJYQQUjNVNnK5ubnI\nycnBjBkzEBoairS0NKSlpeGrr77CjBkzaisjryp6IsnmzZsxZ84caLVaJCUl4Ysvvqhw+02bNmHq\n1KncmJ6cH1JdejxBquSQEaCcfKOc/JJLTqFY1F25e/duJCUlceXJkydDo9FgwYIFggUTSnZ2drl1\nWq0WMTEx5dY/ex+gu7s7TCYTVw4NDeU/ICGEEN5YdJ+ct7c3pk6dinfeeQcA8OOPP2Lt2rWIjo4W\nPKDc0H1yhBBSc6I+1uuHH35AeHg4HBwc4ODg
gPDwcPzwww+8h7EeilpflMpmtfPRCCFERixq5Nq3\nb4/du3fj3r17uHfvHnbt2iXsfQ0yxxir9aWmMxDIoZ9eDhkBysk3yskvueQUikVjcvn5+diwYQOS\nk5Px+PFjbv3GjRsFC0YIIYT8VRaNyb311ltwdXXF1q1bMX/+fHz//fdwdXXF6tWrayOjrMhlZnBC\nCJESUeeTc3Nzg8lkgkajQVJSEp4+fYpevXrh5MmTvAeSOznfViBlNCksIdZN1AtPSmYfsLe3x5kz\nZ/DgwQNkZGTwHsZ6MBksERLIYHnGnJz7NfsrqEVyGfOgnPyinPJg0ZjchAkTkJWVhS+//BJDhgxB\nbm6uLO+RI4QQ8mKxqLuyOps3b8b777/PRx5e1KlTB7NmzcKyZcsAAMuWLUNeXh7mz5+Pb775Bo0a\nNUJQUJAg7y3efXLWjsY6CbFmonZXVmflypV8HIY39erVw86dO5GZmQmg7DjZRx99JFgDRwghRFp4\naeSkxtbWFhMnTsSKFSvKvRYcHIzly5ejqKgInp6eOHr0KABg3rx5+PzzzwEAX3/9NdRqNdRqNVat\nWgWgeCZwV1dXTJw4ESqVCgMGDChzO4X8GMUOYAGj2AEsIpcxD8rJL8opD1bZyAHAlClTsHXr1nLP\nqiw5q7OxsUFYWBgmT56MQ4cO4ffff0dwcDDi4uIQFhaGU6dO4cSJE1i/fj33vMorV65g2rRpOHv2\nLJo2bYodO3ZU8u4GAMF/LitR9svaSOW/UDYajWX+00qhXPp5plLII/cy1Se/ZanWp9FohMFggMFg\nQHBwMITCy5icTqdDQkICH3l4oVQqkZOTg/nz58PW1hYNGzZEbm4uN+u3nZ0dZs+eDQBYtGgRFixY\ngBMnTkCr1WLVqlW4f/8+V+lffPEFWrRogSFDhqBfv364dOkSAOCrr77C06dP8dlnn5V5bxqTEwqN\nyRFizSQ9JtezZ08+DsO7mTNnYsOGDcjLy6t0mzNnzqBZs2a4c+cOgPIVzRjjzv7q16/PrbexsUFh\nYaFAyQkhhPDBokbu8ePH2Lp1KxYuXIiQkBCEhITgf//3f7nX//nPfwoW8K9o1qwZRo0ahQ0bNnAN\nVekG7Oeff8aDBw9w9OhRTJ8+HQ8fPoSvry9++eUX5OfnIy8vD7/88gt8fX2t8CzCKHYACxjFDmCR\n0t0xUkY5+UU55cGiRm7o0KHYvXs3bG1tYWdnBzs7OzRu3FjobM+t9NWUs2fPxr1798q8plAokJmZ\niXnz5uHbb79Fx44dMW3aNMyYMQM6nQ4GgwGenp7w8vLChAkToNVqyx23ojIhhBBpsWhMTqVS4ezZ\ns7WRR/ZoTE4oNCZHiDUTdUzOx8enzMzghBBCiBxY1MhFRkaiW7du6NSpE3f/mEajETqbjNX+pKnW\nvkh5Uli5jHlQTn5RTnmw6NmV+/btEzqHVZFDt5rRaIRerxc7RpXkkJEQIm0W3ycXGRmJK1eu4IMP\nPkBGRgZyc3PRvn17ofPJDs0nRwghNSfqfHIlTwK5ePEiLl26hLS0NIwaNQpRUVG8B5I7auQIIaTm\nhPrutKi7cufOnUhISEC3bt0AAG3atEFOTg7vYawF3VpACCHliTH5sUUXntSvXx916vx306qeIEIA\n8Scbtb5JU6W9UE7KKeVFOjnFmPzYokYuMDAQH330ER48eIB///vf8Pf3x/jx44XOVo6dnV2lrxmN\nRgQEBNRiGrnTix3AAnqxA1hIL3YAC+nFDmAhvdgBLKQXO4CF9GIHEJVF3ZVz5szBgQMHoFQqcenS\nJSxYsAD9+vUTOls51A1ICCGkJiw6k5s7dy769++PZcuWYdmyZejXrx/mzp0rdLZKzZkzh7tXLzw8\nnFufm5uLwMBAuLq64r333uPWOzo6Ijg4GN26dYNGo8HFixcBAEePHoVOp4NOp4O7uzvy8vLAGKvw\n+CWXs1d0
fHkyih3AAkaxA1jIKHYACxnFDmAho9gBLGQUO4CFjGIHEBezgJubW7l1KpXKkl15ZWdn\nx3bs2MH69evHzGYzu3PnDnvttdfY7du3WUREBLO3t2dpaWnMbDYzb29vFhUVxRhjzNHRkf3zn/9k\njDG2bt06Nn78eMYYYwEBASw6OpoxxlheXh4rLCxk27dvt/j4x48fL5cRAAOYDJYICWSwhoyUk3JK\nfZFSTlT6/W5hc1RjVXZX/utf/8K6detw9epVqNVqbn1OTo5o0+scP34cY8aMgUKhQMuWLfH666/j\n9OnTaNKkCTw9PdG6dWsAgJubG1JTU+Hj4wMAGDFiBADA3d0dP//8M4DiKYI++eQTvPvuuxgxYgTa\ntGmDqKioGh2/4nowAHD88+emANzw335x459/Urn6sl5ieaoqo5rXpVDWSyxPVWVU87oUynqJ5amq\njGper63yf3vFjEYjwsLCABT3tgmmqhbwwYMHLCUlhb399tssNTWVpaSksJSUFHbv3j1BWtzq2NnZ\nsVmzZrGNGzdy64KCgtiePXuY0WhkgwcP5tZPmzaNbd68mTFWfCaXmZnJGGPs9OnTTK/Xc9udPXuW\nhYaGsnbt2rELFy6wTz75xOLjh4WFlcsIyOVMjhZaaKGlthdU+v1eTXP03Kock7O3t4ejoyNsbGzQ\nrl07ODo6wtHRES+99BKCgoKEa3mr0KtXL2zbtg1msxkZGRk4duwYPD09UVxHNXP16lV07doVf//7\n39G9e3dcuHABvr6+vB1f2oxiB7CAUewAFjKKHcBCRrEDWMgodgALGcUOYCGj2AFEZdHVlefOnStT\nLiwsRFxcnCCBKlNYWIj69etj+PDhiImJgVarhUKhwNKlS9GyZUucP3/e4qsvS7ZbtWoVIiIiUKdO\nHahUKrz55puwtbW1+Ph0tSchhEhblY/1WrRoERYvXoz8/Hw0bNiQW29ra4uJEydiyZIltRISABIT\nE/HRRx/hxIkTtfaez4PmkyOEkMpU/uguUZ9d+emnn9Zqg/as//u//8OaNWuwatUq9O3bV7QclqBG\njhBCKiOxRu7ChQvo3Lkz4uLiKuyac3d35z2Q3FEXJiGEVKyqZ1eK0shNmDAB69evh16vr/DLOyIi\ngvdAcieXWQjkMFebHDIClJNvlJNfcskpWnel2WxGTEyMaPfFyY1cGjlCCJESUcfk3NzcYDKZeH9z\na0SNHCGE1JxQ350WPbuyb9++2L59O315WxGj0Sh2hGrJISNAOflGOfkll5xCseg+uW+++QZff/01\nbGxs0KC0CHljAAAamElEQVRBAwDFrW52drag4eTqRbr4RIxJEAkhxFIWdVe+++67eP311+Hr6wtX\nV9fayAU7Ozvk5uYCAPbu3YtPPvkEhw4dwquvvlor718iODgYSqUSs2fPtmj7F+8WAuqeJYT8daJ2\nV3744YdIT0/Hxx9/jA4dOmDkyJFYuXIl72FKKzkbOnz4MGbMmIH9+/fXegNXOgchhBD5saiR69On\nD/7xj39gwYIFGD9+PE6fPo1//etfQmfDsWPHMHHiRPz2229o3749AOD7779Hjx49oNPpMGnSJJjN\nZgDFZ36ff/453Nzc4O3tjbt37wIADAYDZsyYgZ49e8LJyQk7duzgjr906VJ4enpCq9UiODiYW79w\n4UK4uLjA19eXm3vu2rVr6NatG7fN5cuXy5Tlxyh2gGrJZSyBcvKLcvJLLjmFYlEj5+/vzz0Y2cXF\nBbGxsdyXv1AeP36M4cOHY9euXejUqRMA4Pz58wgPD0d0dDQSEhJQp04dbN26FQDw6NEjeHt7w2Qy\nwc/PD+vXr+eOlZ6ejqioKPz666/49NNPAQAHDhzAlStXcOrUKSQkJCAuLg6RkZGIi4vDtm3bkJiY\niL179+L06dNQKBTo0KED7O3tkZiYCADYtGkTxo0bJ2gdEEII+WssauQ0Gg
1sbW1x9uxZJCUl4ezZ\ns8jPzxc0WL169dCzZ098++233LrDhw8jLi4OHh4e0Ol0OHLkCFJSUrjtBw0aBADo1q0bUlNTARR3\nNw4bNgwA4Orqijt37gAobuQOHDgAnU6Hbt264eLFi7h8+TKOHz+OESNGoEGDBlAqlRgyZAjXTzx+\n/Hhs2rQJZrMZ4eHhGDNmjKB1ICy92AGqJYcbWAHKyTfKyS+55BSKRVdXrlixAkDxZKlhYWH44IMP\nkJ6ejidPnggWrE6dOggPD0efPn2wePFizJs3DwDw/vvvY9GiReW2t7W1LbNvYWEhV65Xrx73c+mB\nzXnz5mHixIlljrNq1aoy25T+ecSIEQgJCUGfPn3g4eGBZs2aVZLegBdn0tSyT1Qo6RqhMpWpTOWq\nykYpTJpaYvXq1SwwMJB16NCB+fv7s+DgYHb48OG/NpNdNezs7BhjjGVlZbGuXbuyDRs2sOTkZNax\nY0d29+5dxhhjmZmZ7Pr162W2Z4yxn376iRkMBsYYYwaDgW3fvr3ccQ8cOMB69OjBcnNzGWOM/fHH\nH+zu3bssPj6eaTQalp+fz7Kzs1nHjh3Z8uXLuf2nT5/OWrduzfbv319hbkAuk6ZGCD4J4l8VEREh\n2LH5RDn5RTn5JZecQn2XWHQm9/jxY8yePRvu7u5lzpiEVHJVY7NmzbB//374+flh9erV+PLLL9G/\nf3+YzWbY2tpi3bp1eO2118pcBalQKMqVn/25X79+OH/+PLy9vQEASqUS33//PXQ6HUaPHg2tVouW\nLVvC09OzTK4xY8Zg586d6N+/v2CfnRBCCD8suk+O/NeyZcuQk5ODkJCQCl+n++QIIaTmhLpPzqIz\nOVJs+PDhSElJwZEjR8SOQgghxAIWXV1Jiu3cuRMmkwnNmzcXOwoPjGIHqFbJILXUUU5+UU5+ySWn\nUOhMThAvzlNSlMrKrjAlhBDx0Zgcz2iqHUIIqTlRn11JCCGEyBE1ci8oOfTTyyEjQDn5Rjn5JZec\nQqExOQHQzAWE1BzNTUiEQGNyPHvx7pMjhC80nv0iozE5QgghpIYEbeRsbGyg0+m45auvvgIATJgw\nAefPn+ftfYxGIwICAng73ovBKHYACxjFDmAho9gBLGQUO4BF5DKGRDnlQdAxuUaNGiEhIaHc+tJz\nvclZUVERbGxsxI5BCCGkEqJ0V+r1esTHx8NsNsNgMECtVkOj0WDlypUAgKtXr2LgwIHw8PCAn58f\nN0FrVbN85+bmIjAwEK6urnjvvfe49YcPH4a7uzs0Gg0+/PBDFBQU4PTp0xg5ciQAYNeuXWjUqBEK\nCwvx+PFjODk5VZth0qRJ8PLywty5c2ulvoShFzuABfRiB7CQXuwAFtKLHcAiJdOySB3llAlB5jb4\nk42NDXNzc+OW8PBwxhhjer2excXFsdjYWNavXz9u+4cPHzLGGOvTpw+7fPkyY4yxEydOsD59+jDG\nGHv//ffZqFGjGGOMJScnM2dnZ8ZY8VQS9vb2LC0tjZnNZubt7c2ioqJYfn4+e/XVV7ljjR07lq1c\nuZIVFhayDh06MMYYmz17NvP09GRRUVHMaDSyMWPGVJshICCAmc3mCj8zZDPVDi20SG0BT988RI6E\n+vsXtLuyYcOGFXZXlnBycsK1a9fw8ccfY9CgQejfvz9yc3MRExODwMBAbruCggIAlc/yDQCenp5o\n3bo1AMDNzQ0pKSlo3Lgx2rdvD2dnZwDFE66uXbsWM2bMgJOTEy5cuIDTp09j1qxZOHbsGIqKiuDr\n64u8vDxER0dXmiEwMLCa2wQMkP6kqSXrpJKnovKzWcXOU1nZBGCmhPJUVi75WSp5ypeNRiNMJhNm\nzpzJlQFpTPL5bLn0WJcU8lRWlmp9SmrS1OdVeiLT0krO5BhjLDc3l+3YsYMNGzaMjRs3jmVnZ7NW\nrVpVuF9lE6BGRESwwYMHc+unTZvGws
LCWGJiIvPz8+PWHzp0iI0YMYIxxtiCBQvY8uXLmb+/P8vI\nyGBvvvkme+ONN9jZs2fZw4cPLc7wLEAuZ3IREshgDRkpJ38LuP/PckA5+SVUcyTaLQSMMWRmZqKo\nqAgjRozAggULkJCQAKVSifbt22P79u3cdklJSTU+vkKhgIuLC1JTU3H16lUAwJYtW7jfKHx9fbFy\n5Ur4+Pjg5ZdfRmZmJi5duoSuXbuiSZMmvGSQNr3YASygFzuAhfRiB7CQXuwAFpHLGBLllAdBG7n8\n/PwytxD84x//4F5TKBRIS0tD7969odPpEBQUhMWLFwMAtm7dig0bNsDNzQ0qlQq7d+8us191P5eo\nX78+Nm3ahMDAQGg0GtStWxeTJk0CUNy9effuXfj5+QEAtFot1Go1t6+lGQghhEgXPfGEZ/J54okR\n0v/N3gjpZwQoJ1+Kn3hhNBplcfZBOflFTzwhhBBCaojO5HhGXZmEPB96QPOLTagzOZqFQAD0ewMh\nhEgDdVe+oOTwPDs5ZAQoJ98oJ7/kklMo1MgRQgixWjQmxzO5jsnReAghRExCjclRI8cz+dxC8Cya\nsJIQIh66hYDwzCh2gGrJZSyBcvKLcvJLLjmFIqurKzMzM9G3b18AQHp6OmxsbNCiRQsoFAqcOnUK\ndevK6uMQQggRmGy7K0NCQqBUKjFr1ixunRQmMaXuSkIIqTm6T64CjDEYDAY0aNAAJpMJPXv2RJMm\nTWBnZ4fZs2cDAFQqFfbu3Quz2Yw33ngD3t7eiI6OhoeHB95//32EhIQgIyMDW7duRffu3XHq1CnM\nnDkTjx8/RsOGDbFp0yZ06tQJYWFh2L17N/Lz83H16lUMHz4coaGhItcAIYSQqsh+TE6hUODWrVuI\niYnB8uXLK3y9xNWrV/G3v/0NFy5cwMWLF7Ft2zZERUVh2bJlWLRoEYDieeoiIyMRHx+PkJCQMg+V\nTkxMRHh4OM6cOYNt27YhLS1N+A8oGKPYAaoll7EEyskvyskvueQUiqzP5EpUP4lpsfbt26Nr164A\ngK5du3LjeyqVCqmpqQCABw8eYOzYsbhy5QoUCgUKCwu5/f39/aFUKgEAXbp0QWpqKtq0aVPBOxkg\nj0lTy5elNKmiXMomk0lSeeRepvp8MerTWEuTpsp6TM7Ozg5nz57F4MGDMXLkSADAwoULUa9ePcyZ\nMwcA0LFjRxw+fBhmsxkBAQE4c+YMAOCDDz7g9ktNTeVeMxgM8PDwwLRp03D9+nXo9XqkpKQgLCwM\ncXFxWLNmDQAgICAAc+bM4abqKUFjcoQQUnN0C4GFHB0dER8fDwCIj49HSkpKjfbPzs5G69atAQCb\nNm2qcltqFAghRNqsopEr3VU5cuRIZGVlQaVSYe3atXBxcalwu2fLJT///e9/x7x58+Du7o6ioiJu\nvUKhqHJ/+TGKHaBaJV0bUkc5+UU5+SWXnEKRbXelVMmnu9KIshNoSq+70iiTyR4pJ78oJ7/kkpMe\n6yUT8mnkniW9Ro4Q8uKg++RkRX7dmEplM7EjEEII76xiTE5qGGOSXyIiIsqUpTgDgVzGEignvygn\nv+SSUyjUyBFCCLFaNCbHM6H6lQkhxJrRfXIyUnK7gaVLkybNxY5MCCFWiRo5QbAaLTk592s9oRz6\n6eWQEaCcfKOc/JJLTqFQI0cIIcRqiT4mZ2NjA41Gg6KiIjg7O+O7776DnZ2dmJHKMRqNWL58Ofbs\n2YOjR4+iXr168Pb2rnDb57tPjsbxCCEvNqsdk2vUqBESEhKQlJSEJk2a4JtvvhE7UpUiIiIQHR0t\ndgxCCCEWEL2RK83b2xtXr14FAJhMJnh5eUGr1WLEiBF48OBBletXr16Nrl27QqvV4p133gEA5OXl\nYdy4cejRowfc3d2xe/du7n2Sk5O599Xr9YiPj690+xLXr1/HN998gxUrVkCn0+H48eOC14lQ5NBP\nL4
eMAOXkG+Xkl1xyCkUyjVxRUREOHDgAlUoFABg7diyWLl2KxMREqNVqhISEVLk+NDQUJpMJiYmJ\n3NngwoUL4e/vj5MnT+LIkSOYM2cOHj16hNGjRyM8PBwAcPv2baSnp8Pd3b3S7Uu0a9cOkyZNwqxZ\ns5CQkIBevXpV8mkMAIL/XFai7MOQjRWUS5WMxjL/KKks/bLJZJJUHrmXqT75LUu1Po1GIwwGAwwG\nA4KDgyEU0cfk6tatC7VajbS0NDg6OuLEiRPIycmBRqPB9evXAQDXrl1DYGAgIiIioFary62Pi4vD\nwIEDYWdnh2HDhmHYsGFo3LgxPDw88OTJE9StW/z0svv37+P333+HUqlE//79cfbsWaxatQr37t3D\nggULKt3+9u3b3JhcyTx2s2fPrvDz0JgcIYTUnNU+u7Jhw4ZISEhAfn4+BgwYgF27dsHf37/MNpV9\n8NLrf/vtNxw7dgx79uzBwoULuclRf/75Z3Ts2LHcvi+99BLOnDmD8PDwMuOAFW1/+/bt5/58hBBC\nxCOZ7sqGDRti9erV+Oyzz6BUKtGsWTNuzGvLli3Q6/Vo0qRJhesZY7hx4wb0ej2WLFmChw8fIjc3\nFwMGDMDq1au590hISOB+Hj16NEJDQ5Gdnc11kVa1fQmlUomcnBxB6qA2le5CkCo5ZAQoJ98oJ7/k\nklMoojdypScedXNzg7OzM8LDw7F582bMmTMHWq0WSUlJ+OKLLwCgwvWFhYUICgqCRqOBu7s7ZsyY\nAXt7e/zP//wPnj59Co1GA5VKhfnz53Pv9dZbb2Hbtm0YNWoUt66y7UtPmBoQEICdO3dCp9MhKiqq\nNqqIEELIcxJ9TM7a0JgcIYTUnNXeJ0cIIYQIhRo5QShqtIgxYakc+unlkBGgnHyjnPySS06hiH51\npTWirkdCCJEGGpPjGc0nRwghNUdjcoQQQkgNUSMngJpOmkpLzRcpTTQrlzEPyskvyikP1MgJomaT\npoqzREggw/NnFGOiWUKI/NCYHM+e7z45UnM09kmINaExuUqEhYVh+vTpYscghBAiQbJv5Eo/FozU\nhFHsABYwih3AInIZ86Cc/KKc8iC5Ri41NRVqtZorL1u2DCEhIejduzc+/fRT9OjRAy4uLhVOWPrb\nb7/Bx8cHmZmZMBgMmDFjBnr27AknJyfs2LEDQPE9bHPmzIFarYZGo+HmlZs6dSr27NkDABg+fDg+\n/PBDAMDGjRvx+eef4/r163B1dcXEiROhUqkwYMAAPH78WOjqIIQQ8hdI/mbw0mdqRUVFOHnyJPbt\n24eQkBAcPHiQ68PduXMnVqxYgX379sHe3h4KhQLp6emIiorC+fPnMWTIEIwcORI///wzEhMTkZSU\nhIyMDHTv3h1+fn7w8/NDZGQkAgICkJaWhjt37gAAIiMjMWbMGDDGcOXKFWzbtg3//ve/MXr0aOzY\nsQPvvvtuBakNABz//LkpADcA+j/Lxj//pHL1ZX0Vr/9Z+vO3VL1eL2pZankqKuv1eknlqapcQip5\nqD75LxuNRoSFhQEAHB0dIRgmMSkpKUylUnHlZcuWseDgYKbX61l0dDRjjLH09HTm7OzMGGNs06ZN\nrEuXLszLy4vl5ORw+xkMBvbDDz9wZaVSyRhjbObMmWzTpk3c+qCgILZ7926WlpbGvLy8WHJyMjMY\nDGzYsGHs9u3brHPnziw3N5elpKSwjh07cvuFhoayL7/8slx+AAxgtAi+SO6fLiHkLxDq/7Tkuivr\n1q0Ls9nMlUt3CdarVw8AYGNjg8LCQgDFZ3pOTk7Izc3FxYsXyxyrZHsA3Bnfs1fwMMagUCjQunVr\nPHjwAPv374efnx969eqFbdu2wc7ODo0bNwYA1K9fn9uvdAZ5MoodwAJGsQNYRC5jHpSTX5RTHiTX\nyDk4OODu3bvIysrCkydP8Ouvv1a5PWMM7dq1w/bt2zF27FgkJydX
ub2vry+2bdsGs9mMjIwMREZG\nwtPTEwDg5eWFlStX4vXXX4evry+WLVsGPz8/3j4bIYSQ2iW5MTlbW1t88cUX8PT0RJs2beDq6gqg\n7MSlJeXS611cXLB161YEBgZyF5BUtP3w4cMRExMDrVYLhUKBpUuXomXLlgCKG8CDBw+iQ4cOePXV\nV3H//n34+vqWO0ZlZXnRix3AAnqxA1ikZLxB6ignvyinPNDN4Dyjm8FrC90MTog1oZvBCc+MYgew\ngFHsABaRy5gH5eQX5ZQHauQEUbNJU8VZeksgw/NnFGOi2cqYTCaxI1iEcvKLcsoDNXICYIxJfpk/\nf77oGf5KxuzsLLH/mjkPHjwQO4JFKCe/KKc8UCNHCCHEalEj94JKTU0VO0K15JARoJx8o5z8kktO\nodDVlTyT920FhBAiHiGaI8ndJyd39DsDIYRIB3VXEkIIsVrUyBFCCLFa1MjxZP/+/ejcuTM6duyI\n0NBQUTI4OjpCo9FAp9Nxz+PMyspCv3790KlTJ/Tv37/M5cSLFy9Gx44d0blzZxw4cIBbHxcXB7Va\njY4dO2LGjBl/KdO4cePg4OBQZo5APjM9efIEo0ePRseOHeHl5YXr16/zljM4OBht27aFTqeDTqfD\nvn37RM958+ZN9O7dG127doVKpcLq1asBSK9OK8sptTp9/PgxevToATc3N3Tp0gXz5s0DIK36rCyj\n1OqyRFFREXQ6HQICAgBIoC6rmaWAWKCwsJA5OTmxlJQUVlBQwLRaLUtOTq71HI6OjiwzM7PMujlz\n5rDQ0FDGGGNLlixhc+fOZYwxdu7cOabVallBQQFLSUlhTk5OzGw2M8YY6969Ozt58iRjjLGBAwey\nffv2PXemY8eOsfj4+DLTJ/GZae3atWzy5MmMMcZ+/PFHNnr0aN5yBgcHs+XLl5fbVsyct2/fZgkJ\nCYwxxnJyclinTp1YcnKy5Oq0spxSrNO8vDzGGGNPnz5lPXr0YJGRkZKrz4oySrEuGWNs+fLlbMyY\nMSwgIIAxJv7/d2rkeBAdHc0GDBjAlRcvXswWL15c6zkcHR3ZvXv3yqxzcXFh6enpjLHiLx4XFxfG\nGGOLFi1iS5Ys4bYbMGAAi4mJYbdu3WKdO3fm1v/nP/9hH3300V/K9ewcgXxmGjBgADtx4gRjrPgL\n4OWXX+YtZ3BwMFu2bFm57cTOWdrQoUPZwYMHJVunz+aUcp3m5eUxDw8PdvbsWcnWZ+mMUqzLmzdv\nMn9/f3bkyBE2ePBgxpj4/9+pu5IHaWlpePXVV7ly27ZtkZaWVus5FAoF+vbtCw8PD6xfvx4AcOfO\nHTg4OAAonsaoZMbzW7duoW3btuUyP7u+TZs2vH8WPjOVrvu6devC3t4eWVn8PQ1lzZo10Gq1+PDD\nD7luFqnkTE1NRUJCAnr06CHpOi3J6eXlBUB6dWo2m+Hm5gYHBweui1Vq9VlRRkB6dfnJJ59g6dKl\nqFPnv02L2HVJjRwPpHJvXFRUFBISErBv3z6sXbsWkZGRZV5/droiKZBiphKTJ09GSkoKTCYTWrVq\nhdmzZ4sdiZObm4uRI0di1apVUCqVZV6TUp3m5ubirbfewqpVq2BnZyfJOq1Tpw5MJhP++OMPHDt2\nDBEREWVel0J9PpvRaDRKri5//fVXtGzZEjqdrtJbqcSoS2rkeNCmTRvcvHmTK9+8ebPMbyK1pVWr\nVgCAFi1aYPjw4Th16hQcHByQnp4OALh9+zY3d96zmf/44w+0bdsWbdq0wR9//FFmfZs2bXjNyUem\nkvpt06YNbty4AQAoLCzEw4cP0bx5c15ytmzZkvtPOX78eJw6dUoSOZ8+fYqRI0ciKCgIw4YNAyDN\nOi3J+d5773E5pVqnAGBvb49BgwYhLi5OkvVZOmNsbKzk6jI6Ohq7d+9G+/bt8c477+DIkSMICgoS\nvS6pkeOBh4cHLl++jNTUVBQU
FGDbtm0YMmRIrWZ49OgRcnJyAAB5eXk4cOAA1Go1hgwZgs2bNwMA\nNm/ezH3ZDBkyBD/++CMKCgqQkpKCy5cvw9PTE6+88gqaNGmCkydPgjGGLVu2cPvwhY9MQ4cOLXes\n7du3w9/fn7ect2/f5n7euXMnd+WlmDkZY/jwww/RpUsXzJw5k1svtTqtLKfU6vTevXtcN19+fj4O\nHjwInU4nqfqsLGNJwyGVuly0aBFu3ryJlJQU/Pjjj+jTpw+2bNkifl0+1+giKWfv3r2sU6dOzMnJ\niS1atKjW3//atWtMq9UyrVbLunbtymXIzMxk/v7+rGPHjqxfv37s/v373D4LFy5kTk5OzMXFhe3f\nv59bHxsby1QqFXNycmLTp0//S7nefvtt1qpVK2Zra8vatm3LNm7cyGumx48fs8DAQObs7Mx69OjB\nUlJSeMm5YcMGFhQUxNRqNdNoNGzo0KHc4LmYOSMjI5lCoWBarZa5ubkxNzc3tm/fPsnVaUU59+7d\nK7k6TUpKYjqdjmm1WqZWq9lXX33FGOP3/81fzVlZRqnVZWlGo5G7ulLsuqRnVxJCCLFa1F1JCCHE\nalEjRwghxGpRI0cIIcRqUSNHCCHEalEjRwghxGpRI0cIIcRq/T+nBnG/WbsPNQAAAABJRU5ErkJg\ngg==\n",
"text": "<matplotlib.figure.Figure at 0x3478c90>"
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Pick out the `doc_id`s corresponding to different administrations"
},
{
"cell_type": "code",
"collapsed": false,
"input": "nixon_ids = meta[meta.written_admin == 'Nixon'].index\ncarter_ids = meta[meta.written_admin == 'Carter'].index\ntruman_ids = meta[meta.written_admin == 'Truman'].index",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Load a TextFileStreamer\n\n* Streams various `info` from a source of text files\n* Info includes `doc_id` (derived from the filename), and `tokens` (you can specify a `tokenizer_func`)."
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Shuffle the paths to ensure we don't bias results\nstreamer = streamers.TextFileStreamer(text_base_path=text_base_path)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Make a gensim dictionary\n\n* A dictionary is built using a stream of tokens (an *iterator* over token lists)\n* Every Streamer has a `token_stream` method that returns a token stream"
},
{
"cell_type": "code",
"collapsed": false,
"input": "dictionary = corpora.Dictionary(streamer.token_stream())\ndictionary.filter_extremes()\ndictionary.compactify()\nprint dictionary",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Dictionary(100000 unique tokens)\n"
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Look at the words in the dict\nwords_docfreq = gensim_helpers.get_words_docfreq(dictionary)\nwords_docfreq.head()",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>tokenid</th>\n <th>docfreq</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>out</th>\n <td> 71130</td>\n <td> 57414</td>\n </tr>\n <tr>\n <th>made</th>\n <td> 37045</td>\n <td> 56389</td>\n </tr>\n <tr>\n <th>now</th>\n <td> 53861</td>\n <td> 54971</td>\n </tr>\n <tr>\n <th>two</th>\n <td> 41694</td>\n <td> 54938</td>\n </tr>\n <tr>\n <th>action</th>\n <td> 84108</td>\n <td> 54648</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": " tokenid docfreq\nout 71130 57414\nmade 37045 56389\nnow 53861 54971\ntwo 41694 54938\naction 84108 54648"
}
],
"prompt_number": 20
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Analyze topics for restricted data sets"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Load a SvmLightPlus corpus and save to disk\ncorpus = gensim_helpers.SvmLightPlusCorpus.from_streamer_dict(streamer, dictionary, corpus_path)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Create a new corpus with only carter documents\ncorpus_carter = gensim_helpers.SvmLightPlusCorpus(corpus_path, doc_id_filter=carter_ids)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": "lda = models.LdaModel(corpus_carter, id2word=dictionary, num_topics=5, passes=3, chunksize=8000)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": "for t in xrange(lda.num_topics):\n print('topic %s' % t)\n print(lda.print_topic(t, topn=7))",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "topic 0\n0.010*shadrin + 0.007*intelligence + 0.006*cia + 0.005*soviet + 0.005*washington + 0.004*mr + 0.004*house\ntopic 1\n0.005*government + 0.005*political + 0.004*release + 0.004*foreign + 0.004*policy + 0.004*soviet + 0.004*approved\ntopic 2\n0.006*military + 0.006*force + 0.005*air + 0.005*training + 0.005*mission + 0.004*general + 0.004*intelligence\ntopic 3\n0.004*text + 0.004*relations + 0.004*secretary + 0.004*secret + 0.004*meeting + 0.004*chinese + 0.004*soviet\ntopic 4\n0.015*soviet + 0.009*defense + 0.008*release + 0.008*approved + 0.006*military + 0.006*percent + 0.005*forces\n"
}
],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Create a new corpus with only nixon documents\ncorpus_nixon = gensim_helpers.SvmLightPlusCorpus(corpus_path, doc_id_filter=nixon_ids)\nlda = models.LdaModel(corpus_nixon, id2word=dictionary, num_topics=5, passes=3)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": "for t in xrange(lda.num_topics):\n print('topic %s' % t)\n print(lda.print_topic(t, topn=7))",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "topic 0\n0.006*countries + 0.006*economic + 0.005*policy + 0.005*states + 0.004*united + 0.004*trade + 0.004*foreign\ntopic 1\n0.009*mr + 0.008*kissinger + 0.006*out + 0.006*think + 0.005*top + 0.005*want + 0.005*now\ntopic 2\n0.010*french + 0.008*military + 0.008*forces + 0.006*vietnam + 0.006*states + 0.005*united + 0.005*general\ntopic 3\n0.006*text + 0.005*united + 0.005*states + 0.005*illegible + 0.004*government + 0.004*mr + 0.004*minister\ntopic 4\n0.006*program + 0.005*national + 0.005*federal + 0.004*under + 0.004*programs + 0.004*new + 0.003*states\n"
}
],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Get a frame with topic scores for all docs (scored with `lda`, which was last fit on the Nixon subset above)\ntopics_df = gensim_helpers.get_topics_df(corpus, lda)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": "# This is the doc-by-doc topic scores. This can be used to find cosine distance between docs\ntopics_df.head()",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>topic_0</th>\n <th>topic_1</th>\n <th>topic_2</th>\n <th>topic_3</th>\n <th>topic_4</th>\n </tr>\n <tr>\n <th>doc_id</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>203522</th>\n <td> 0.968993</td>\n <td> 0.000000</td>\n <td> 0.000000</td>\n <td> 0.022855</td>\n <td> 0.000000</td>\n </tr>\n <tr>\n <th>210679</th>\n <td> 0.333655</td>\n <td> 0.284794</td>\n <td> 0.050252</td>\n <td> 0.000000</td>\n <td> 0.330941</td>\n </tr>\n <tr>\n <th>205931</th>\n <td> 0.000000</td>\n <td> 0.133802</td>\n <td> 0.092086</td>\n <td> 0.772075</td>\n <td> 0.000000</td>\n </tr>\n <tr>\n <th>292915</th>\n <td> 0.336099</td>\n <td> 0.618505</td>\n <td> 0.000000</td>\n <td> 0.023786</td>\n <td> 0.017883</td>\n </tr>\n <tr>\n <th>290433</th>\n <td> 0.172216</td>\n <td> 0.046617</td>\n <td> 0.058449</td>\n <td> 0.721021</td>\n <td> 0.000000</td>\n </tr>\n </tbody>\n</table>\n</div>",
"metadata": {},
"output_type": "pyout",
"prompt_number": 56,
"text": " topic_0 topic_1 topic_2 topic_3 topic_4\ndoc_id \n203522 0.968993 0.000000 0.000000 0.022855 0.000000\n210679 0.333655 0.284794 0.050252 0.000000 0.330941\n205931 0.000000 0.133802 0.092086 0.772075 0.000000\n292915 0.336099 0.618505 0.000000 0.023786 0.017883\n290433 0.172216 0.046617 0.058449 0.721021 0.000000"
}
],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Consistent with the flat prior, topics are more-or-less distributed evenly\ntopics_df.mean()",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 57,
"text": "topic_0 0.203761\ntopic_1 0.145299\ntopic_2 0.142916\ntopic_3 0.352185\ntopic_4 0.153257\ndtype: float64"
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Reality check: Topics for each doc sum to 1 (almost)\ntopics_df.sum(axis=1).describe()",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 59,
"text": "count 117509.000000\nmean 0.997418\nstd 0.003374\nmin 0.961320\n25% 0.996520\n50% 0.998747\n75% 0.999611\nmax 1.000000\ndtype: float64"
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment