bertomartin/Taxes Paid and Personal Income.ipynb

## Taxes Paid and Personal Income.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:8150453367b906e07eef200351901fb9c424c6eeeb84438199e61c4a49c6d449"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Gross Tax Collections by State"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Gross-Collections,-by-Type-of-Tax-and-State,-Fiscal-Year-IRS-Data-Book-Table-5\n",
      "\n",
      "**Notes:**\n",
      "\n",
      "* Estate and trust income tax is tax on the income (or loss) of an estate or a trust. Prior to FY 2008, this amount was reported as part of the individual income tax total\n",
      "* Estate tax is the tax on a property transfer at death"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "import glob"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gross = pd.DataFrame()\n",
      "grossfiles = glob.glob('gross/gross????.csv')\n",
      "for file in grossfiles:\n",
      "    df = pd.read_csv(\n",
      "        file, \n",
      "        header=0,\n",
      "        thousands=',',\n",
      "        names=['state', 'total','business','individual_total', \n",
      "               'individual_witheld_fica', 'individual_payments_seca', \n",
      "               'ui', 'rr', 'estate_trust', 'estate', 'gift', 'excise']\n",
      "    )\n",
      "    df.dropna(subset=['state'],inplace=True)\n",
      "    df['year'] = year\n",
      "    gross = pd.concat([gross, df])\n",
      "\n",
      "#hacky stuff to fix up state names\n",
      "gross['state'] = gross['state'].str.strip()\n",
      "gross['state'] = gross['state'].str.replace('United States, total \\[4\\]', 'United States')\n",
      "gross['state'] = gross['state'].str.replace('US Totals', 'United States')\n",
      "gross['state'] = gross['state'].str.replace('US Total', 'United States')\n",
      "#gross.replace('*','')\n",
      "\n",
      "#hacky stuff to get rid of non-numbers\n",
      "for col in gross.columns.values:\n",
      "    if col <> 'state':\n",
      "        gross[col] = gross[col].replace('[^0-9.]+', '', regex=True) \n",
      "\n",
      "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
      "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
      "gross['ui'] = gross['ui'].astype(str).convert_objects(convert_numeric=True)\n",
      "gross['estate'] = gross['estate'].astype(str).convert_objects(convert_numeric=True)\n",
      "gross['gift'] = gross['gift'].astype(str).convert_objects(convert_numeric=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 29
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Tax Refunds by State"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Amount-of-Refunds-Issued,-Including-Interest,-by-State-and-Fiscal-Year-IRS-Data-Book-Table-8\n",
      "\n",
      "**Notes**\n",
      "\n",
      "* total tax refunds include overpayment refunds, refunds resulting from examination activity, refundable tax credits, and other refunds required by law. Also includes interest.\n",
      "* employment refunds include refunds of self-employment income taxes under the Self Employment Insurance Contributions Act (SECA); railroad retirement taxes under the Railroad Retirement Tax Act (RRTA); unemployment insurance taxes under the Federal Unemployment Tax Act (FUTA); and Old-Age, Survivors, Disability, and Hospital Insurance (OASDHI) taxes on salaries and wages under the Federal Insurance Contributions Act (FICA)\n",
      "* excise tax refunds exclude refunds of excise taxes paid from U.S. Customs and Border Protection and the Alcohol Tobacco Tax and Trade Bureau"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "refunds = pd.DataFrame()\n",
      "refundfiles = glob.glob('refunds/refunds????.csv')\n",
      "for file in refundfiles:\n",
      "    df = pd.read_csv(\n",
      "        file, \n",
      "        header=0,\n",
      "        thousands=',',\n",
      "        names=['state', 'total','business','individual_income', \n",
      "                'estate_trust', 'employment', 'estate', 'gift', 'excise'],\n",
      "    )\n",
      "    df.dropna(subset=['state'],inplace=True)\n",
      "    df['year'] = year\n",
      "    refunds = pd.concat([refunds, df])\n",
      "    \n",
      "#hacky stuff to fix up state names\n",
      "refunds['state'] = refunds['state'].str.strip()\n",
      "refunds['state'] = refunds['state'].str.replace('US Total', 'United States')\n",
      "\n",
      "#hacky stuff to get rid of non-numbers\n",
      "for col in refunds.columns.values:\n",
      "    if col <> 'state':\n",
      "        refunds[col] = refunds[col].replace('[^0-9.]+', '', regex=True)  \n",
      "\n",
      "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
      "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
      "refunds['estate'] = refunds['estate'].astype(str).convert_objects(convert_numeric=True)\n",
      "refunds['gift'] = refunds['gift'].astype(str).convert_objects(convert_numeric=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 30
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Net Taxes by State"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#create stripped-down, standardized versions of gross collections and refunds, so we can calculate net\n",
      "g = gross[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift', 'excise']]\n",
      "g.set_index(['year', 'state'], inplace=True)\n",
      "r = refunds\n",
      "r['estate_trust'] = r['estate_trust'].fillna(0)\n",
      "r['individual_total'] = r['individual_income'] + r['estate_trust'] + r['employment']\n",
      "r = r[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift','excise']]\n",
      "r.set_index(['year', 'state'], inplace=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "net = g.sub(r)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "net.to_csv('net_taxes.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 33
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "",
	"signature": "sha256:8150453367b906e07eef200351901fb9c424c6eeeb84438199e61c4a49c6d449"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Gross Tax Collections by State"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Gross-Collections,-by-Type-of-Tax-and-State,-Fiscal-Year-IRS-Data-Book-Table-5\n",
	"\n",
	"Notes:\n",
	"\n",
	"* Estate and trust income tax is tax on the income (or loss) of an estate or a trust. Prior to FY 2008, this amount was reported as part of the individual income tax total\n",
	"* Estate tax is the tax on a property transfer at death"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import glob"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 28
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"gross = pd.DataFrame()\n",
	"grossfiles = glob.glob('gross/gross????.csv')\n",
	"for file in grossfiles:\n",
	" df = pd.read_csv(\n",
	" file, \n",
	" header=0,\n",
	" thousands=',',\n",
	" names=['state', 'total','business','individual_total', \n",
	" 'individual_witheld_fica', 'individual_payments_seca', \n",
	" 'ui', 'rr', 'estate_trust', 'estate', 'gift', 'excise']\n",
	" )\n",
	" df.dropna(subset=['state'],inplace=True)\n",
	" df['year'] = year\n",
	" gross = pd.concat([gross, df])\n",
	"\n",
	"#hacky stuff to fix up state names\n",
	"gross['state'] = gross['state'].str.strip()\n",
	"gross['state'] = gross['state'].str.replace('United States, total \\[4\\]', 'United States')\n",
	"gross['state'] = gross['state'].str.replace('US Totals', 'United States')\n",
	"gross['state'] = gross['state'].str.replace('US Total', 'United States')\n",
	"#gross.replace('*','')\n",
	"\n",
	"#hacky stuff to get rid of non-numbers\n",
	"for col in gross.columns.values:\n",
	" if col <> 'state':\n",
	" gross[col] = gross[col].replace('[^0-9.]+', '', regex=True) \n",
	"\n",
	"#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
	"#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
	"gross['ui'] = gross['ui'].astype(str).convert_objects(convert_numeric=True)\n",
	"gross['estate'] = gross['estate'].astype(str).convert_objects(convert_numeric=True)\n",
	"gross['gift'] = gross['gift'].astype(str).convert_objects(convert_numeric=True)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 29
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Tax Refunds by State"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Amount-of-Refunds-Issued,-Including-Interest,-by-State-and-Fiscal-Year-IRS-Data-Book-Table-8\n",
	"\n",
	"Notes\n",
	"\n",
	"* total tax refunds include overpayment refunds, refunds resulting from examination activity, refundable tax credits, and other refunds required by law. Also includes interest.\n",
	"* employment refunds include refunds of self-employment income taxes under the Self Employment Insurance Contributions Act (SECA); railroad retirement taxes under the Railroad Retirement Tax Act (RRTA); unemployment insurance taxes under the Federal Unemployment Tax Act (FUTA); and Old-Age, Survivors, Disability, and Hospital Insurance (OASDHI) taxes on salaries and wages under the Federal Insurance Contributions Act (FICA)\n",
	"* excise tax refunds exclude refunds of excise taxes paid from U.S. Customs and Border Protection and the Alcohol Tobacco Tax and Trade Bureau"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"refunds = pd.DataFrame()\n",
	"refundfiles = glob.glob('refunds/refunds????.csv')\n",
	"for file in refundfiles:\n",
	" df = pd.read_csv(\n",
	" file, \n",
	" header=0,\n",
	" thousands=',',\n",
	" names=['state', 'total','business','individual_income', \n",
	" 'estate_trust', 'employment', 'estate', 'gift', 'excise'],\n",
	" )\n",
	" df.dropna(subset=['state'],inplace=True)\n",
	" df['year'] = year\n",
	" refunds = pd.concat([refunds, df])\n",
	" \n",
	"#hacky stuff to fix up state names\n",
	"refunds['state'] = refunds['state'].str.strip()\n",
	"refunds['state'] = refunds['state'].str.replace('US Total', 'United States')\n",
	"\n",
	"#hacky stuff to get rid of non-numbers\n",
	"for col in refunds.columns.values:\n",
	" if col <> 'state':\n",
	" refunds[col] = refunds[col].replace('[^0-9.]+', '', regex=True) \n",
	"\n",
	"#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
	"#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
	"refunds['estate'] = refunds['estate'].astype(str).convert_objects(convert_numeric=True)\n",
	"refunds['gift'] = refunds['gift'].astype(str).convert_objects(convert_numeric=True)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 30
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Net Taxes by State"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"#create stripped-down, standardized versions of gross collections and refunds, so we can calculate net\n",
	"g = gross[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift', 'excise']]\n",
	"g.set_index(['year', 'state'], inplace=True)\n",
	"r = refunds\n",
	"r['estate_trust'] = r['estate_trust'].fillna(0)\n",
	"r['individual_total'] = r['individual_income'] + r['estate_trust'] + r['employment']\n",
	"r = r[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift','excise']]\n",
	"r.set_index(['year', 'state'], inplace=True)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 31
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"net = g.sub(r)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 32
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"net.to_csv('net_taxes.csv')"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 33
	}
	],
	"metadata": {}
	}
	]
	}