@Prooffreader
Created March 28, 2016 21:05
Jupyter notebook to find the most characteristic words in successful and unsuccessful whitehouse.gov petitions
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Finding most characteristic words in successful and unsuccessful whitehouse.org petitions\n",
"\n",
"By David Taylor (www.prooffreader.com) March 2016\n",
"\n",
"* Dunning log-likelihood method used to compare corpus of words in body of successful petitions (enough signatures garnered to prompt a White House Administration response) with corpus of words in body of unsuccessful petitions\n",
"* To remove weighting effect if words are repeated numerous times in body, corpora limited to at most one appearance of word per petition\n",
"\n",
"Steps before this notebook:\n",
"\n",
"1. Downloaded MySQL dump of historical petitions from https://petitions.whitehouse.gov/developers\n",
"2. Used MySQL to export wtp_data_petitions table as a CSV"
]
},
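{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the statistic implemented in the `loglikely` function below is the standard Dunning log-likelihood,\n",
"\n",
"$$LL = 2\\left(n_1 \\ln\\frac{n_1}{E_1} + n_2 \\ln\\frac{n_2}{E_2}\\right), \\qquad E_i = t_i\\,\\frac{n_1+n_2}{t_1+t_2},$$\n",
"\n",
"where $n_1, n_2$ are a word's counts in the two corpora, $t_1, t_2$ are the corpus sizes, and $E_1, E_2$ are the counts expected if the word were equally frequent in both. The sign is flipped when the word is relatively more frequent in corpus 2."
]
},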
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import Counter\n",
"from textblob import TextBlob, Word\n",
"import re\n",
"import pandas as pd\n",
"from numpy import log"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"serial 1\n",
"id 4e7b352b4bd5046c04000000\n",
"type petition\n",
"title Stop Animal Homelessness at Its Roots\n",
"body Every year in the United States, an estimated ...\n",
"signature_threshold 5000\n",
"signature_count 11786\n",
"signatures_needed 0\n",
"url https://petitions.whitehouse.gov/petition/stop...\n",
"deadline 30\n",
"status Reviewed\n",
"created 2011-09-22 13:18:43\n",
"Name: 0, dtype: object"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load csv and name columns\n",
"df = pd.read_csv('wtp_data_petitions.csv', index_col=None, names=['serial',\n",
" 'id', 'type', 'title', 'body', 'signature_threshold', \n",
" 'signature_count', 'signatures_needed', 'url', 'deadline',\n",
" 'status', 'created'])\n",
"\n",
"# 'created' column is in epoch seconds; change to timestamp\n",
"df['created'] = pd.to_datetime(df['created'],unit='s')\n",
"\n",
"# view first record\n",
"df.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# limit to petitions after their deadlines.\n",
"# 'Reviewed' means a successful petition, 'Closed' unsuccessful\n",
"df = df[df.status.isin(['Reviewed', 'Closed'])]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def loglikely(n1, t1, n2, t2, add1=False):\n",
" \"\"\"Calculates Dunning log likelihood of an observation of\n",
" frequency n1 in a corpus of size t1, compared to a frequency n2\n",
" in a corpus of size t2. If result is positive, it is more\n",
" likely to occur in corpus 1, otherwise in corpus 2.\"\"\"\n",
" from numpy import log\n",
" if add1:\n",
" n1 += 1\n",
" n2 += 1\n",
" try:\n",
" e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values\n",
" e2 = t2*1.0*(n1+n2)/(t1+t2)\n",
" LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))\n",
" if n2*1.0/t2 > n1*1.0/t1:\n",
" LL = -LL\n",
" return LL\n",
" except:\n",
" return None"
]
},
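{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A quick sanity check of loglikely on made-up counts (not petition data):\n",
"# a word seen 30 times in a 10,000-word corpus vs. 10 times in another\n",
"# 10,000-word corpus should score positive, and swapping the corpora\n",
"# should flip the sign\n",
"print(loglikely(30, 10000, 10, 10000, add1=True))\n",
"print(loglikely(10, 10000, 30, 10000, add1=True))"
]
},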
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create four counters, of title and body\n",
"# Note that only body was used for this analysis\n",
"# Title did not result in substantially different results\n",
"# except for having fewer words\n",
"\n",
"counts = {'Reviewed': {'title': Counter(), 'body': Counter()},\n",
" 'Closed': {'title': Counter(), 'body': Counter()}}\n",
"\n",
"for idx, row in df.iterrows():\n",
" status = row.status\n",
" for sample in ['title', 'body']:\n",
" text = row[sample].lower()\n",
" # the text is full of periods without following spaces, e.g.\n",
" # \"...this petition.Therefore, ...\" Replace them with spaces.\n",
" text = re.sub('([a-z])\\.([a-z])', r'\\1. \\2', text)\n",
" words = TextBlob(text).words\n",
" # Set used in order not to multiple-count words that appear more\n",
" # than once in a petition\n",
" lemmas = set([Word(x).lemmatize() for x in words])\n",
" counts[status][sample].update(list(lemmas))"
]
},
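{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustration of the preprocessing above on a made-up sentence:\n",
"# the regex restores the space after the mid-sentence period, and\n",
"# lemmatizing into a set collapses repeats like 'guns'/'gun'\n",
"demo = 'We need new gun laws.These guns are everywhere.'.lower()\n",
"demo = re.sub('([a-z])\\.([a-z])', r'\\1. \\2', demo)\n",
"print(sorted(set(Word(w).lemmatize() for w in TextBlob(demo).words)))"
]
},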
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# total size of each corpus (Reviewed and Closed)\n",
"\n",
"t1, t2 = {}, {}\n",
"for sample in ['title', 'body']:\n",
" t1[sample] = sum(counts['Reviewed'][sample].values())\n",
" t2[sample] = sum(counts['Closed'][sample].values())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Calculate frequency per thousand petitions for each word\n",
"\n",
"fpt = {'Reviewed': {'title': {}, 'body': {}},\n",
" 'Closed': {'title': {}, 'body': {}}}\n",
"\n",
"for status in ['Reviewed', 'Closed']:\n",
" length = len(df[df.status == status])\n",
" for sample in ['title', 'body']:\n",
" for lemma, count in counts[status][sample].items():\n",
" fpt[status][sample][lemma] = count * 1000.0 / length"
]
},
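{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Spot-check one word: per the results table further down, 'gun' appears\n",
"# in roughly 88 of every 1000 successful petition bodies but only about\n",
"# 23 per 1000 unsuccessful ones\n",
"fpt['Reviewed']['body']['gun'], fpt['Closed']['body']['gun']"
]
},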
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# calculate log-likelihoods\n",
"\n",
"word2LL = {'title': {}, 'body': {}}\n",
"for sample in ['title', 'body']:\n",
" for lemma, n1 in counts['Reviewed'][sample].items():\n",
" if lemma in counts['Closed'][sample].keys():\n",
" word2LL[sample][lemma] = loglikely(n1, t1[sample], \n",
" counts['Closed'][sample][lemma],\n",
" t2[sample], add1=True)\n",
" \n"
]
},
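{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Spot-check the score of a single word; 'gun' tops the ranking of words\n",
"# characteristic of successful petitions (see the table below)\n",
"word2LL['body'].get('gun')"
]
},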
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" LL_body reviewed_fpt closed_fpt\n",
"gun 29.718629 88.435374 23.051331\n",
"tragedy 22.662834 40.816327 6.653992\n",
"access 21.594185 95.238095 33.269962\n",
"imperative 18.144088 17.006803 0.950570\n",
"regulatory 15.501128 20.408163 2.376426\n",
" LL_body reviewed_fpt closed_fpt\n",
"down -7.129465 13.605442 46.340304\n",
"genocide -7.872795 10.204082 42.775665\n",
"i -8.210591 57.823129 112.642586\n",
"code -9.142554 6.802721 39.923954\n",
"say -12.908181 6.802721 48.479087\n"
]
}
],
"source": [
"dfb = pd.DataFrame.from_dict(word2LL['body'], orient='index')\n",
"dfb.columns = ['LL_body']\n",
"dfb.sort_values('LL_body', ascending=False, inplace=True)\n",
"dfb['reviewed_fpt'] = [fpt['Reviewed']['body'][x] if x in fpt['Reviewed']['body'].keys() else 0 for x in dfb.index]\n",
"dfb['closed_fpt'] = [fpt['Closed']['body'][x] if x in fpt['Closed']['body'].keys() else 0 for x in dfb.index]\n",
"print(dfb.head())\n",
"print(dfb.tail())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfb.to_csv('whitehouse_petitions_keyness.csv')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# to produce five randomly selected titles for each of the top words\n",
"# sometimes UnicodeEncodeErrors were produced due to foreign characters, e.g. Chinese\n",
"# I was lazy and didn't handle them, I just reran the code until no\n",
"# errors were thrown"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['body_lower'] = df.body.apply(lambda x: x.lower())\n",
"df['title_lower'] = df.title.apply(lambda x: x.lower())\n",
"text = []\n",
"deactivated = False\n",
"for idx, row in dfb.iterrows():\n",
" if row.name == 'condition':\n",
" deactivated = False\n",
" if not deactivated:\n",
" df_ = df[df.body_lower.str.contains(row.name)]\n",
" assert len(df_) >= 5\n",
" df_ = df_.sample(n=5)\n",
" text.append(row.name)\n",
" for idx2, row2 in df_.iterrows():\n",
" item = '- '+row2.title\n",
" text.append(item)\n",
" text.append('')\n",
" if row.name == 'regulation':\n",
" deactivated = True\n",
"\n",
"with open('LL_samples.txt', 'w+') as f:\n",
" f.write('\\n'.join(text))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}