@Prooffreader
Created March 28, 2016 21:05
Jupyter notebook to find the most characteristic words in successful and unsuccessful whitehouse.gov petitions
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Finding most characteristic words in successful and unsuccessful whitehouse.org petitions\n",
"\n",
"By David Taylor (www.prooffreader.com) March 2016\n",
"\n",
"* Dunning log-likelihood method used to compare corpus of words in body of successful petitions (enough signatures garnered to prompt a White House Administration response) with corpus of words in body of unsuccessful petitions\n",
"* To remove weighting effect if words are repeated numerous times in body, corpora limited to at most one appearance of word per petition\n",
"\n",
"Steps before this notebook:\n",
"\n",
"1. Downloaded MySQL dump of historical petitions from https://petitions.whitehouse.gov/developers\n",
"2. Used MySQL to export wtp_data_petitions table as a CSV"
]
},
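{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the statistic implemented in the `loglikely` function below is the standard Dunning log-likelihood,\n",
"\n",
"$$LL = 2\\left(n_1 \\ln\\frac{n_1}{E_1} + n_2 \\ln\\frac{n_2}{E_2}\\right), \\qquad E_i = t_i\\,\\frac{n_1+n_2}{t_1+t_2},$$\n",
"\n",
"where $n_1, n_2$ are a word's counts in the two corpora, $t_1, t_2$ are the corpus sizes, and $E_1, E_2$ are the counts expected if the word were equally frequent in both. The sign is flipped when the word is relatively more frequent in corpus 2."
]
},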
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import Counter\n",
"from textblob import TextBlob, Word\n",
"import re\n",
"import pandas as pd\n",
"from numpy import log"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"serial 1\n",
"id 4e7b352b4bd5046c04000000\n",
"type petition\n",
"title Stop Animal Homelessness at Its Roots\n",
"body Every year in the United States, an estimated ...\n",
"signature_threshold 5000\n",
"signature_count 11786\n",
"signatures_needed 0\n",
"url https://petitions.whitehouse.gov/petition/stop...\n",
"deadline 30\n",
"status Reviewed\n",
"created 2011-09-22 13:18:43\n",
"Name: 0, dtype: object"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load csv and name columns\n",
"df = pd.read_csv('wtp_data_petitions.csv', index_col=None, names=['serial',\n",
" 'id', 'type', 'title', 'body', 'signature_threshold', \n",
" 'signature_count', 'signatures_needed', 'url', 'deadline',\n",
" 'status', 'created'])\n",
"\n",
"# 'created' column is in epoch seconds; change to timestamp\n",
"df['created'] = pd.to_datetime(df['created'],unit='s')\n",
"\n",
"# view first record\n",
"df.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# limit to petitions after their deadlines.\n",
"# 'Reviewed' means a successful petition, 'Closed' unsuccessful\n",
"df = df[df.status.isin(['Reviewed', 'Closed'])]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def loglikely(n1, t1, n2, t2, add1=False):\n",
" \"\"\"Calculates Dunning log likelihood of an observation of\n",
" frequency n1 in a corpus of size t1, compared to a frequency n2\n",
" in a corpus of size t2. If result is positive, it is more\n",
" likely to occur in corpus 1, otherwise in corpus 2.\"\"\"\n",
" from numpy import log\n",
" if add1:\n",
" n1 += 1\n",
" n2 += 1\n",
" try:\n",
" e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values\n",
" e2 = t2*1.0*(n1+n2)/(t1+t2)\n",
" LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))\n",
" if n2*1.0/t2 > n1*1.0/t1:\n",
" LL = -LL\n",
" return LL\n",
" except:\n",
" return None"
]
},
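{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# A quick sanity check of loglikely on made-up counts (not petition data):\n",
"# a word seen 30 times in a 10,000-word corpus vs. 10 times in another\n",
"# 10,000-word corpus should score positive, and swapping the corpora\n",
"# should flip the sign\n",
"print(loglikely(30, 10000, 10, 10000, add1=True))\n",
"print(loglikely(10, 10000, 30, 10000, add1=True))"
]
},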
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create four counters, of title and body\n",
"# Note that only body was used for this analysis\n",
"# Title did not result in substantially different results\n",
"# except for having fewer words\n",
"\n",
"counts = {'Reviewed': {'title': Counter(), 'body': Counter()},\n",
" 'Closed': {'title': Counter(), 'body': Counter()}}\n",
"\n",
"for idx, row in df.iterrows():\n",
" status = row.status\n",
" for sample in ['title', 'body']:\n",
" text = row[sample].lower()\n",
" # the text is full of periods without following spaces, e.g.\n",
" # \"...this petition.Therefore, ...\" Replace them with spaces.\n",
" text = re.sub('([a-z])\\.([a-z])', r'\\1. \\2', text)\n",
" words = TextBlob(text).words\n",
" # Set used in order not to multiple-count words that appear more\n",
" # than once in a petition\n",
" lemmas = set([Word(x).lemmatize() for x in words])\n",
" counts[status][sample].update(list(lemmas))"
]
},
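{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustration of the preprocessing above on a made-up sentence:\n",
"# the regex restores the space after the mid-sentence period, and\n",
"# lemmatizing into a set collapses repeats like 'guns'/'gun'\n",
"demo = 'We need new gun laws.These guns are everywhere.'.lower()\n",
"demo = re.sub('([a-z])\\.([a-z])', r'\\1. \\2', demo)\n",
"print(sorted(set(Word(w).lemmatize() for w in TextBlob(demo).words)))"
]
},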
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# total size of each corpus (Reviewed and Closed)\n",
"\n",
"t1, t2 = {}, {}\n",
"for sample in ['title', 'body']:\n",
" t1[sample] = sum(counts['Reviewed'][sample].values())\n",
" t2[sample] = sum(counts['Closed'][sample].values())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Calculate frequency per thousand petitions for each word\n",
"\n",
"fpt = {'Reviewed': {'title': {}, 'body': {}},\n",
" 'Closed': {'title': {}, 'body': {}}}\n",
"\n",
"for status in ['Reviewed', 'Closed']:\n",
" length = len(df[df.status == status])\n",
" for sample in ['title', 'body']:\n",
" for lemma, count in counts[status][sample].items():\n",
" fpt[status][sample][lemma] = count * 1000.0 / length"
]
},
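{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Spot-check one word: per the results table further down, 'gun' appears\n",
"# in roughly 88 of every 1000 successful petition bodies but only about\n",
"# 23 per 1000 unsuccessful ones\n",
"fpt['Reviewed']['body']['gun'], fpt['Closed']['body']['gun']"
]
},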
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# calculate log-likelihoods\n",
"\n",
"word2LL = {'title': {}, 'body': {}}\n",
"for sample in ['title', 'body']:\n",
" for lemma, n1 in counts['Reviewed'][sample].items():\n",
" if lemma in counts['Closed'][sample].keys():\n",
" word2LL[sample][lemma] = loglikely(n1, t1[sample], \n",
" counts['Closed'][sample][lemma],\n",
" t2[sample], add1=True)\n",
" \n"
]
},
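{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Spot-check the score of a single word; 'gun' tops the ranking of words\n",
"# characteristic of successful petitions (see the table below)\n",
"word2LL['body'].get('gun')"
]
},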
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" LL_body reviewed_fpt closed_fpt\n",
"gun 29.718629 88.435374 23.051331\n",
"tragedy 22.662834 40.816327 6.653992\n",
"access 21.594185 95.238095 33.269962\n",
"imperative 18.144088 17.006803 0.950570\n",
"regulatory 15.501128 20.408163 2.376426\n",
" LL_body reviewed_fpt closed_fpt\n",
"down -7.129465 13.605442 46.340304\n",
"genocide -7.872795 10.204082 42.775665\n",
"i -8.210591 57.823129 112.642586\n",
"code -9.142554 6.802721 39.923954\n",
"say -12.908181 6.802721 48.479087\n"
]
}
],
"source": [
"dfb = pd.DataFrame.from_dict(word2LL['body'], orient='index')\n",
"dfb.columns = ['LL_body']\n",
"dfb.sort_values('LL_body', ascending=False, inplace=True)\n",
"dfb['reviewed_fpt'] = [fpt['Reviewed']['body'][x] if x in fpt['Reviewed']['body'].keys() else 0 for x in dfb.index]\n",
"dfb['closed_fpt'] = [fpt['Closed']['body'][x] if x in fpt['Closed']['body'].keys() else 0 for x in dfb.index]\n",
"print(dfb.head())\n",
"print(dfb.tail())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfb.to_csv('whitehouse_petitions_keyness.csv')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# to produce five randomly selected titles for each of the top words\n",
"# sometimes UnicodeEncodeErrors were produced due to foreign characters, e.g. Chinese\n",
"# I was lazy and didn't handle them, I just reran the code until no\n",
"# errors were thrown"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['body_lower'] = df.body.apply(lambda x: x.lower())\n",
"df['title_lower'] = df.title.apply(lambda x: x.lower())\n",
"text = []\n",
"deactivated = False\n",
"for idx, row in dfb.iterrows():\n",
" if row.name == 'condition':\n",
" deactivated = False\n",
" if not deactivated:\n",
" df_ = df[df.body_lower.str.contains(row.name)]\n",
" assert len(df_) >= 5\n",
" df_ = df_.sample(n=5)\n",
" text.append(row.name)\n",
" for idx2, row2 in df_.iterrows():\n",
" item = '- '+row2.title\n",
" text.append(item)\n",
" text.append('')\n",
" if row.name == 'regulation':\n",
" deactivated = True\n",
"\n",
"with open('LL_samples.txt', 'w+') as f:\n",
" f.write('\\n'.join(text))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}