Skip to content

Instantly share code, notes, and snippets.

@DGalt
Last active July 8, 2016 00:16
Show Gist options
  • Save DGalt/6b419549086d3aadff8b7cc072fc60b3 to your computer and use it in GitHub Desktop.
Save DGalt/6b419549086d3aadff8b7cc072fc60b3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from glob import glob\n",
"import csv\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['./Raw/raw dump June 2016/P046957 - report 1 - all complaints in time frame.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 2 - identified accused xi.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 3 - police officer witness data xi.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 4 - victim data.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 5 - complainant (reporting party) data.xlsx']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = sorted(glob('./Raw/raw dump June 2016/*.xlsx'))\n",
"files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 1 - all complaints in time frame.xlsx\n",
"\n",
"** Questions **\n",
"- For some of the Investigator rows there is a datetime in Unnamed: 6 - what is this?\n",
"- What is the column of values labeled as \"Unnamed: 5\" for the Investigator"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_excel(files[0], skiprows=9)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"df.dropna(how='all', inplace=True)\n",
"df = df[df['Location Code:'] != 'end of record'].copy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['Number:'].fillna(method='ffill', inplace=True)\n",
"df['Number:'] = df['Number:'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df = df[df['Beat:'] == 'Investigator with Current Assignment and Rank:'].copy()\n",
"df = df[df['Beat:'] != 'Investigator with Current Assignment and Rank:'].copy()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 17045 entries, 0 to 50111\n",
"Data columns (total 13 columns):\n",
"Number: 17045 non-null int64\n",
"Beat: 16366 non-null object\n",
"Location Code: 16529 non-null object\n",
"Address of Incident: 12628 non-null object\n",
"Unnamed: 4 12877 non-null object\n",
"Unnamed: 5 824 non-null object\n",
"Unnamed: 6 13615 non-null object\n",
"Incident Date & Time 16531 non-null datetime64[ns]\n",
"Complaint Date 16531 non-null datetime64[ns]\n",
"Closed Date 14753 non-null datetime64[ns]\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: datetime64[ns](3), float64(3), int64(1), object(6)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"df = df.replace('----', float('nan')).replace('-----', float('nan'))\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.drop(['Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Beat:</th>\n",
" <th>Location Code:</th>\n",
" <th>Address of Incident:</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Unnamed: 5</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Incident Date &amp; Time</th>\n",
" <th>Complaint Date</th>\n",
" <th>Closed Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>106213</td>\n",
" <td>1631</td>\n",
" <td>17</td>\n",
" <td>3700</td>\n",
" <td>N HARLEM AVE</td>\n",
" <td>NaN</td>\n",
" <td>CHICAGO IL 60634</td>\n",
" <td>2015-07-19 21:00:00</td>\n",
" <td>2015-07-20</td>\n",
" <td>2015-09-21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Beat: Location Code: Address of Incident: Unnamed: 4 Unnamed: 5 \\\n",
"0 106213 1631 17 3700 N HARLEM AVE NaN \n",
"\n",
" Unnamed: 6 Incident Date & Time Complaint Date Closed Date \n",
"0 CHICAGO IL 60634 2015-07-19 21:00:00 2015-07-20 2015-09-21 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Beat</th>\n",
" <th>Location_Code</th>\n",
" <th>Address</th>\n",
" <th>Street</th>\n",
" <th>Apartment</th>\n",
" <th>City_State_Zipcode</th>\n",
" <th>Incident_Datetime</th>\n",
" <th>Complaint_Date</th>\n",
" <th>Closed_Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>106213</td>\n",
" <td>1631</td>\n",
" <td>17</td>\n",
" <td>3700</td>\n",
" <td>N HARLEM AVE</td>\n",
" <td>NaN</td>\n",
" <td>CHICAGO IL 60634</td>\n",
" <td>2015-07-19 21:00:00</td>\n",
" <td>2015-07-20</td>\n",
" <td>2015-09-21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Beat Location_Code Address Street Apartment \\\n",
"0 106213 1631 17 3700 N HARLEM AVE NaN \n",
"\n",
" City_State_Zipcode Incident_Datetime Complaint_Date Closed_Date \n",
"0 CHICAGO IL 60634 2015-07-19 21:00:00 2015-07-20 2015-09-21 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns = ['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street', 'Apartment', \n",
" 'City_State_Zipcode', 'Incident_Datetime', 'Complaint_Date', 'Closed_Date']\n",
"df.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 16404 entries, 1 to 50085\n",
"Data columns (total 13 columns):\n",
"Number: 16404 non-null int64\n",
"Beat: 16404 non-null object\n",
"Location Code: 16278 non-null object\n",
"Address of Incident: 16278 non-null object\n",
"Unnamed: 4 16278 non-null object\n",
"Unnamed: 5 11228 non-null object\n",
"Unnamed: 6 15714 non-null object\n",
"Incident Date & Time 0 non-null datetime64[ns]\n",
"Complaint Date 0 non-null datetime64[ns]\n",
"Closed Date 0 non-null datetime64[ns]\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: datetime64[ns](3), float64(3), int64(1), object(6)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"invest_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df.dropna(how='all', axis=1, inplace=True)\n",
"invest_df.drop('Beat:', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Location Code:</th>\n",
" <th>Address of Incident:</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Unnamed: 5</th>\n",
" <th>Unnamed: 6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>106213</td>\n",
" <td>KLIMAS, ROBERT</td>\n",
" <td>121</td>\n",
" <td>COMMANDER</td>\n",
" <td>0</td>\n",
" <td>2008-08-04 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1038595</td>\n",
" <td>DAUN, SHERRY</td>\n",
" <td>113</td>\n",
" <td>SUPERVISING INV IPRA</td>\n",
" <td>NaN</td>\n",
" <td>2008-12-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1039179</td>\n",
" <td>JONES, VINCENT</td>\n",
" <td>113</td>\n",
" <td>INVESTIGATOR 2 IPRA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1053492</td>\n",
" <td>MC GUIRE, TERRENCE</td>\n",
" <td>003</td>\n",
" <td>LIEUTENANT OF POLICE</td>\n",
" <td>376</td>\n",
" <td>1991-11-18 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1053493</td>\n",
" <td>BREIMON, GERALD</td>\n",
" <td>014</td>\n",
" <td>SERGEANT OF POLICE</td>\n",
" <td>869</td>\n",
" <td>1993-11-22 00:00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Location Code: Address of Incident: Unnamed: 4 \\\n",
"1 106213 KLIMAS, ROBERT 121 COMMANDER \n",
"16 1038595 DAUN, SHERRY 113 SUPERVISING INV IPRA \n",
"19 1039179 JONES, VINCENT 113 INVESTIGATOR 2 IPRA \n",
"22 1053492 MC GUIRE, TERRENCE 003 LIEUTENANT OF POLICE \n",
"25 1053493 BREIMON, GERALD 014 SERGEANT OF POLICE \n",
"\n",
" Unnamed: 5 Unnamed: 6 \n",
"1 0 2008-08-04 00:00:00 \n",
"16 NaN 2008-12-01 00:00:00 \n",
"19 NaN NaN \n",
"22 376 1991-11-18 00:00:00 \n",
"25 869 1993-11-22 00:00:00 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"invest_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Unnamed: 5 and 6 **\n",
"\n",
"Looking at the full list of CPD officers it seems that Unnamed: 5 correspond to the officer's star number, while 6 corresponds to his/her appointment date. I wanted to confirm this, though, so I went ahead and looked at the subset of investigator names that appear in the full list of CPD officers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Both 1050\n",
"One 154\n",
"dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sworn = pd.read_csv('./all_sworn_officer.csv')\n",
"sworn_names = (sworn.LAST_NME + ', ' + sworn.FIRST_NME).tolist()\n",
"mask = invest_df['Location Code:'].drop_duplicates().apply(lambda x: x in sworn_names)\n",
"matches = invest_df.drop_duplicates(subset='Location Code:')[mask]\n",
"\n",
"#the year value in the APPOINTED_DATE column is ambiguous, so it needs to be fixed\n",
"def fix_time(x):\n",
" split = x.split('-')\n",
" if split[-1][0] == '0' or split[-1][0] == '1':\n",
" split[-1] = '20' + split[-1]\n",
" else:\n",
" split[-1] = '19' + split[-1]\n",
" return '-'.join(split)\n",
"\n",
"sworn.APPOINTED_DATE = pd.to_datetime(sworn.APPOINTED_DATE.astype(str).apply(fix_time), \n",
" errors='coerce')\n",
"matches['Unnamed: 6'] = matches['Unnamed: 6'].dt.normalize()\n",
"\n",
"def which_matches(x):\n",
" lname = x['Location Code:'].values[0].split(', ')[0]\n",
" fname = x['Location Code:'].values[0].split(', ')[1]\n",
" \n",
" sub_sworn = sworn[(sworn.LAST_NME==lname)&(sworn.FIRST_NME==fname)]\n",
" \n",
" star = x['Unnamed: 5'].values[0]\n",
" date = x['Unnamed: 6'].values[0]\n",
" star_matches = False\n",
" date_matches = False\n",
" sworn_stars = ['STAR%s'%num for num in range(1, 11)]\n",
" if star in sub_sworn[sworn_stars].dropna(axis=1).values:\n",
" star_matches = True \n",
" if date in sub_sworn.APPOINTED_DATE.values:\n",
" date_matches = True\n",
" \n",
" if star_matches and date_matches:\n",
" return 'Both'\n",
" elif star_matches or date_matches:\n",
" return \"One\"\n",
" else:\n",
" return \"Neither\"\n",
" \n",
"matches.dropna().groupby(level=0).apply(which_matches).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df.columns = ['Complaint_Number', 'Investigator_Name', \n",
" 'Investigator_Current_Assignment', 'Investigator_Rank', \n",
" 'Investigator_Star', 'Investigator_Appointed_Date']"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1 = pd.merge(df, invest_df, on='Complaint_Number', how='outer')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### P046957 - report 2 - identified accused xi.xlsx\n",
"\n",
"** Questions **\n",
"- Does \"Unnamed: 11\" and \"Unnamed: 13\" correspond to \"Recommended Discipline\" and \"Discipline\" (columns to the left of those)?"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_excel(files[1], skiprows=8)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>107901.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>108026.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: \\\n",
"0 107901.0 NaN NaN NaN NaN \n",
"1 NaN GONZALES, ROBIN 1981 F S \n",
"2 NaN NaN end of record NaN NaN \n",
"3 108026.0 NaN NaN NaN NaN \n",
"4 NaN BECKER, JOHN 1970 M WHI \n",
"\n",
" Date of Appt: Current Unit: Current Rank: Star: Complaint Category \\\n",
"0 NaT NaN NaN NaN NaN \n",
"1 2008-04-28 18.0 PO 5137.0 NaN \n",
"2 NaT NaN NaN NaN NaN \n",
"3 NaT NaN NaN NaN NaN \n",
"4 2000-01-24 17.0 PO 4734.0 NaN \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Unnamed: 13 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 38571 entries, 0 to 38570\n",
"Data columns (total 14 columns):\n",
"Number: 10086 non-null float64\n",
"Accused: 18399 non-null object\n",
"Birth Yr: 28485 non-null object\n",
"Gender: 18399 non-null object\n",
"Race Code: 18399 non-null object\n",
" Date of Appt: 18399 non-null datetime64[ns]\n",
"Current Unit: 18399 non-null float64\n",
"Current Rank: 18297 non-null object\n",
"Star: 17371 non-null float64\n",
"Complaint Category 16945 non-null object\n",
"Finding &\n",
"Recommended Discipline 6295 non-null object\n",
"Unnamed: 11 15234 non-null float64\n",
"Final Finding & \n",
"Discipline 5892 non-null object\n",
"Unnamed: 13 14818 non-null float64\n",
"dtypes: datetime64[ns](1), float64(5), object(8)\n",
"memory usage: 4.1+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>107901</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>107901</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>108026</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: Date of Appt: \\\n",
"0 107901 NaN NaN NaN NaN NaT \n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"2 107901 NaN end of record NaN NaN NaT \n",
"3 108026 NaN NaN NaN NaN NaT \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"\n",
" Current Unit: Current Rank: Star: Complaint Category \\\n",
"0 NaN NaN NaN NaN \n",
"1 18.0 PO 5137.0 NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 17.0 PO 4734.0 NaN \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Unnamed: 13 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number:'].fillna(method='ffill', inplace=True)\n",
"df['Number:'] = df['Number:'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>108026</td>\n",
" <td>MARKHAM, SEAN</td>\n",
" <td>1971</td>\n",
" <td>M</td>\n",
" <td>I</td>\n",
" <td>2000-06-19</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>19054.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1038595</td>\n",
" <td>WELLS, OTIS</td>\n",
" <td>1967</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-04-02</td>\n",
" <td>6.0</td>\n",
" <td>PO</td>\n",
" <td>5385.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1038595</td>\n",
" <td>MCCLAY, CHARLES</td>\n",
" <td>1983</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-10-29</td>\n",
" <td>3.0</td>\n",
" <td>PO</td>\n",
" <td>4735.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: Date of Appt: \\\n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"5 108026 MARKHAM, SEAN 1971 M I 2000-06-19 \n",
"8 1038595 WELLS, OTIS 1967 M BLK 2007-04-02 \n",
"9 1038595 MCCLAY, CHARLES 1983 M BLK 2007-10-29 \n",
"\n",
" Current Unit: Current Rank: Star: Complaint Category \\\n",
"1 18.0 PO 5137.0 NaN \n",
"4 17.0 PO 4734.0 NaN \n",
"5 17.0 PO 19054.0 NaN \n",
"8 6.0 PO 5385.0 05D-NO ARREST \n",
"9 3.0 PO 4735.0 05D-NO ARREST \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"1 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"5 NaN NaN NaN \n",
"8 EX 600.0 EX \n",
"9 EX 600.0 EX \n",
"\n",
" Unnamed: 13 \n",
"1 NaN \n",
"4 NaN \n",
"5 NaN \n",
"8 600.0 \n",
"9 600.0 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df = df[df['Birth Yr:'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Name</th>\n",
" <th>Birth_Yr</th>\n",
" <th>Gender</th>\n",
" <th>Race_Code</th>\n",
" <th>Date_of_Appt</th>\n",
" <th>Current_Unit</th>\n",
" <th>Current_Rank</th>\n",
" <th>Star</th>\n",
" <th>Complaint_Category</th>\n",
" <th>Finding</th>\n",
" <th>Recommended_Discipline</th>\n",
" <th>Final_Finding</th>\n",
" <th>Discipline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>108026</td>\n",
" <td>MARKHAM, SEAN</td>\n",
" <td>1971</td>\n",
" <td>M</td>\n",
" <td>I</td>\n",
" <td>2000-06-19</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>19054.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1038595</td>\n",
" <td>WELLS, OTIS</td>\n",
" <td>1967</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-04-02</td>\n",
" <td>6.0</td>\n",
" <td>PO</td>\n",
" <td>5385.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1038595</td>\n",
" <td>MCCLAY, CHARLES</td>\n",
" <td>1983</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-10-29</td>\n",
" <td>3.0</td>\n",
" <td>PO</td>\n",
" <td>4735.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Name Birth_Yr Gender Race_Code Date_of_Appt \\\n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"5 108026 MARKHAM, SEAN 1971 M I 2000-06-19 \n",
"8 1038595 WELLS, OTIS 1967 M BLK 2007-04-02 \n",
"9 1038595 MCCLAY, CHARLES 1983 M BLK 2007-10-29 \n",
"\n",
" Current_Unit Current_Rank Star Complaint_Category Finding \\\n",
"1 18.0 PO 5137.0 NaN NaN \n",
"4 17.0 PO 4734.0 NaN NaN \n",
"5 17.0 PO 19054.0 NaN NaN \n",
"8 6.0 PO 5385.0 05D-NO ARREST EX \n",
"9 3.0 PO 4735.0 05D-NO ARREST EX \n",
"\n",
" Recommended_Discipline Final_Finding Discipline \n",
"1 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"5 NaN NaN NaN \n",
"8 600.0 EX 600.0 \n",
"9 600.0 EX 600.0 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns = [col.strip().replace(':', '').replace(' ', '_') for col in df.columns]\n",
"df.rename(columns={'Number': 'Complaint_Number',\n",
" 'Accused': 'Name',\n",
" 'Finding_&\\nRecommended_Discipline': 'Finding',\n",
" 'Unnamed_11': 'Recommended_Discipline',\n",
" 'Final_Finding_&_\\nDiscipline': 'Final_Finding',\n",
" 'Unnamed_13': 'Discipline'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report2 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 3 - police officer witness data xi.xlsx"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Number:</td>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Number:</td>\n",
" <td>1053509</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Gender Race Star Birth Year \\\n",
"0 Number: 1053502 NaN NaN NaN \n",
"1 HARRIS, KAL M BLK 14236 1974.0 \n",
"2 NaN NaN end of record NaN NaN \n",
"3 Number: 1053509 NaN NaN NaN \n",
"4 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"\n",
" Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 \\\n",
"0 NaT NaN NaN NaN NaN NaN \n",
"1 2001-05-29 NaN NaN NaN NaN NaN \n",
"2 NaT NaN NaN NaN NaN NaN \n",
"3 NaT NaN NaN NaN NaN NaN \n",
"4 1996-12-02 NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 11 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[2], skiprows=9)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['Complaint_Number'] = df[df['Unnamed: 0']=='Number:']['Gender']"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1053502</td>\n",
" <td>Number:</td>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1053509</td>\n",
" <td>Number:</td>\n",
" <td>1053509</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Unnamed: 0 Gender Race Star \\\n",
"0 1053502 Number: 1053502 NaN NaN \n",
"1 NaN HARRIS, KAL M BLK 14236 \n",
"2 NaN NaN NaN end of record NaN \n",
"3 1053509 Number: 1053509 NaN NaN \n",
"4 NaN BUKOWSKIBUS, GEORGE M WHI 11982 \n",
"\n",
" Birth Year Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 \\\n",
"0 NaN NaT NaN NaN NaN NaN \n",
"1 1974.0 2001-05-29 NaN NaN NaN NaN \n",
"2 NaN NaT NaN NaN NaN NaN \n",
"3 NaN NaT NaN NaN NaN NaN \n",
"4 1966.0 1996-12-02 NaN NaN NaN NaN \n",
"\n",
" Unnamed: 10 Unnamed: 11 \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cols = ['Complaint_Number'] + [col for col in df.columns if col != 'Complaint_Number']\n",
"df = df[cols].copy()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1053502</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053509</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053509</td>\n",
" <td>HEINICHEN, WALTER</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>410</td>\n",
" <td>1968.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053509</td>\n",
" <td>LAMEKA, MARGARET</td>\n",
" <td>F</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1958.0</td>\n",
" <td>1986-06-16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053545</td>\n",
" <td>SCALES, MONICA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>15769</td>\n",
" <td>1968.0</td>\n",
" <td>2006-09-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Unnamed: 0 Gender Race Star Birth Year \\\n",
"1 1053502 HARRIS, KAL M BLK 14236 1974.0 \n",
"4 1053509 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"5 1053509 HEINICHEN, WALTER M WHI 410 1968.0 \n",
"6 1053509 LAMEKA, MARGARET F WHI NaN 1958.0 \n",
"9 1053545 SCALES, MONICA F BLK 15769 1968.0 \n",
"\n",
" Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 \\\n",
"1 2001-05-29 NaN NaN NaN NaN NaN \n",
"4 1996-12-02 NaN NaN NaN NaN NaN \n",
"5 1996-12-02 NaN NaN NaN NaN NaN \n",
"6 1986-06-16 NaN NaN NaN NaN NaN \n",
"9 2006-09-25 NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 11 \n",
"1 NaN \n",
"4 NaN \n",
"5 NaN \n",
"6 NaN \n",
"9 NaN "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.Complaint_Number.fillna(method='ffill', inplace=True)\n",
"df = df[(df['Unnamed: 0'] != 'Number:') & (df['Race'] != 'end of record')]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 3432 entries, 1 to 6372\n",
"Data columns (total 13 columns):\n",
"Complaint_Number 3432 non-null object\n",
"Unnamed: 0 3432 non-null object\n",
"Gender 3432 non-null object\n",
"Race 3432 non-null object\n",
"Star 3220 non-null object\n",
"Birth Year 3415 non-null float64\n",
"Date Appointed 3415 non-null datetime64[ns]\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"dtypes: datetime64[ns](1), float64(7), object(5)\n",
"memory usage: 375.4+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', \n",
" 'Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 3432 entries, 1 to 6372\n",
"Data columns (total 7 columns):\n",
"Complaint_Number 3432 non-null object\n",
"Unnamed: 0 3432 non-null object\n",
"Gender 3432 non-null object\n",
"Race 3432 non-null object\n",
"Star 3220 non-null object\n",
"Birth Year 3415 non-null float64\n",
"Date Appointed 3415 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](1), float64(1), object(5)\n",
"memory usage: 214.5+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Name</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1053502</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053509</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053509</td>\n",
" <td>HEINICHEN, WALTER</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>410</td>\n",
" <td>1968.0</td>\n",
" <td>1996-12-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053509</td>\n",
" <td>LAMEKA, MARGARET</td>\n",
" <td>F</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1958.0</td>\n",
" <td>1986-06-16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053545</td>\n",
" <td>SCALES, MONICA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>15769</td>\n",
" <td>1968.0</td>\n",
" <td>2006-09-25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Name Gender Race Star Birth Year \\\n",
"1 1053502 HARRIS, KAL M BLK 14236 1974.0 \n",
"4 1053509 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"5 1053509 HEINICHEN, WALTER M WHI 410 1968.0 \n",
"6 1053509 LAMEKA, MARGARET F WHI NaN 1958.0 \n",
"9 1053545 SCALES, MONICA F BLK 15769 1968.0 \n",
"\n",
" Date Appointed \n",
"1 2001-05-29 \n",
"4 1996-12-02 \n",
"5 1996-12-02 \n",
"6 1986-06-16 \n",
"9 2006-09-25 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report3 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### P046957 - report 4 - victim data.xlsx"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Unnamed: 1</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Race Desc</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Unnamed: 12</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Unnamed: 1 Gender Age Unnamed: 4 Race Desc \\\n",
"0 1039179.0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN F 25.832877 NaN BLACK \n",
"2 NaN NaN M 38.000000 NaN BLACK \n",
"3 NaN NaN NaN NaN NaN end of record \n",
"4 1053502.0 NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \\\n",
"0 NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 12 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[3], skiprows=7)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19821 entries, 0 to 19820\n",
"Data columns (total 13 columns):\n",
"Number 6271 non-null float64\n",
"Unnamed: 1 0 non-null float64\n",
"Gender 7279 non-null object\n",
"Age 5889 non-null float64\n",
"Unnamed: 4 0 non-null float64\n",
"Race Desc 13550 non-null object\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: float64(11), object(2)\n",
"memory usage: 2.0+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1039179.0 NaN NaN NaN\n",
"1 NaN F 25.832877 BLACK\n",
"2 NaN M 38.000000 BLACK\n",
"3 NaN NaN NaN end of record\n",
"4 1053502.0 NaN NaN NaN"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(axis=1, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1039179 NaN NaN NaN\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"3 1039179 NaN NaN end of record\n",
"4 1053502 NaN NaN NaN"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number'].fillna(method='ffill', inplace=True)\n",
"df['Number'] = df['Number'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"3 1039179 NaN NaN end of record\n",
"5 1053502 F NaN BLACK\n",
"6 1053502 NaN NaN end of record"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1053505</td>\n",
" <td>F</td>\n",
" <td>18.378082</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1053509</td>\n",
" <td>F</td>\n",
" <td>42.747945</td>\n",
" <td>WHITE HISPANIC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"5 1053502 F NaN BLACK\n",
"8 1053505 F 18.378082 BLACK\n",
"11 1053509 F 42.747945 WHITE HISPANIC"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[df['Race Desc'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race_Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1053505</td>\n",
" <td>F</td>\n",
" <td>18.378082</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1053509</td>\n",
" <td>F</td>\n",
" <td>42.747945</td>\n",
" <td>WHITE HISPANIC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Gender Age Race_Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"5 1053502 F NaN BLACK\n",
"8 1053505 F 18.378082 BLACK\n",
"11 1053509 F 42.747945 WHITE HISPANIC"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report4 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 5 - complainant (reporting party) data"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Unnamed: 1</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Race Desc</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1038595.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>NaN</td>\n",
" <td>WHITE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Unnamed: 1 Gender Age Unnamed: 4 Race Desc \\\n",
"0 1038595.0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN F 63.805479 NaN WHITE \n",
"2 NaN NaN NaN NaN NaN end of record \n",
"3 1039179.0 NaN NaN NaN NaN NaN \n",
"4 NaN NaN M 50.449315 NaN BLACK \n",
"\n",
" Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \n",
"0 NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[4], skiprows=7)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 39917 entries, 0 to 39916\n",
"Data columns (total 12 columns):\n",
"Number 13238 non-null float64\n",
"Unnamed: 1 0 non-null float64\n",
"Gender 13441 non-null object\n",
"Age 11675 non-null float64\n",
"Unnamed: 4 0 non-null float64\n",
"Race Desc 26679 non-null object\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"dtypes: float64(10), object(2)\n",
"memory usage: 3.7+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1038595.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1038595.0 NaN NaN NaN\n",
"1 NaN F 63.805479 WHITE\n",
"2 NaN NaN NaN end of record\n",
"3 1039179.0 NaN NaN NaN\n",
"4 NaN M 50.449315 BLACK"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(axis=1, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1038595</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1038595</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1038595 NaN NaN NaN\n",
"1 1038595 F 63.805479 WHITE\n",
"2 1038595 NaN NaN end of record\n",
"3 1039179 NaN NaN NaN\n",
"4 1039179 M 50.449315 BLACK"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number'].fillna(method='ffill', inplace=True)\n",
"df['Number'] = df['Number'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1038595</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"2 1038595 NaN NaN end of record\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[df['Race Desc'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race_Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Gender Age Race_Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report5 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merging datasets"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',\n",
" 'Apartment', 'City_State_Zipcode', 'Incident_Datetime',\n",
" 'Complaint_Date', 'Closed_Date', 'Investigator_Name',\n",
" 'Investigator_Current_Assignment', 'Investigator_Rank',\n",
" 'Investigator_Star', 'Investigator_Appointed_Date'],\n",
" dtype='object')"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report1.columns"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Name', 'Birth_Yr', 'Gender', 'Race_Code',\n",
" 'Date_of_Appt', 'Current_Unit', 'Current_Rank', 'Star',\n",
" 'Complaint_Category', 'Finding', 'Recommended_Discipline',\n",
" 'Final_Finding', 'Discipline'],\n",
" dtype='object')"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report2.columns"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Name', 'Gender', 'Race', 'Star', 'Birth Year',\n",
" 'Date Appointed'],\n",
" dtype='object')"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report3.columns"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report4.columns"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report5.columns"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1.rename(columns={'Investigator_Appointed_Date': 'Investigator_Date_Appointed'})\n",
"report2.rename(columns={'Date_of_Appt': 'Date_Appointed'})\n",
"\n",
"cols = ['Accused_' + col.strip().replace(' ', '_') for col in report2.columns[1:]]\n",
"report2.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['PO_Witness_' + col.strip().replace(' ', '_') for col in report3.columns[1:]]\n",
"report3.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['Victim_' + col.strip().replace(' ', '_') for col in report4.columns[1:]]\n",
"report4.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['Complainant_' + col.strip().replace(' ', '_') for col in report5.columns[1:]]\n",
"report5.columns = ['Complaint_Number'] + cols"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',\n",
" 'Apartment', 'City_State_Zipcode', 'Incident_Datetime',\n",
" 'Complaint_Date', 'Closed_Date', 'Investigator_Name',\n",
" 'Investigator_Current_Assignment', 'Investigator_Rank',\n",
" 'Investigator_Star', 'Investigator_Appointed_Date'],\n",
" dtype='object')"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report1.columns"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Accused_Name', 'Accused_Birth_Yr',\n",
" 'Accused_Gender', 'Accused_Race_Code', 'Accused_Date_of_Appt',\n",
" 'Accused_Current_Unit', 'Accused_Current_Rank', 'Accused_Star',\n",
" 'Accused_Complaint_Category', 'Accused_Finding',\n",
" 'Accused_Recommended_Discipline', 'Accused_Final_Finding',\n",
" 'Accused_Discipline'],\n",
" dtype='object')"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report2.columns"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'PO_Witness_Name', 'PO_Witness_Gender',\n",
" 'PO_Witness_Race', 'PO_Witness_Star', 'PO_Witness_Birth_Year',\n",
" 'PO_Witness_Date_Appointed'],\n",
" dtype='object')"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report3.columns"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Victim_Gender', 'Victim_Age', 'Victim_Race_Desc'], dtype='object')"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report4.columns"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Complainant_Gender', 'Complainant_Age',\n",
" 'Complainant_Race_Desc'],\n",
" dtype='object')"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report5.columns"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"merged = pd.merge(report1, report2, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report3, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report4, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report5, on='Complaint_Number', how='outer')"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 31597 entries, 0 to 31596\n",
"Data columns (total 40 columns):\n",
"Complaint_Number 31597 non-null object\n",
"Beat 26935 non-null object\n",
"Location_Code 27149 non-null object\n",
"Address 19684 non-null object\n",
"Street 20110 non-null object\n",
"Apartment 1530 non-null object\n",
"City_State_Zipcode 21396 non-null object\n",
"Incident_Datetime 27151 non-null datetime64[ns]\n",
"Complaint_Date 27151 non-null datetime64[ns]\n",
"Closed_Date 24203 non-null datetime64[ns]\n",
"Investigator_Name 27753 non-null object\n",
"Investigator_Current_Assignment 27753 non-null object\n",
"Investigator_Rank 27753 non-null object\n",
"Investigator_Star 19661 non-null object\n",
"Investigator_Appointed_Date 26836 non-null object\n",
"Accused_Name 21080 non-null object\n",
"Accused_Birth_Yr 21080 non-null object\n",
"Accused_Gender 21080 non-null object\n",
"Accused_Race_Code 21080 non-null object\n",
"Accused_Date_of_Appt 21080 non-null datetime64[ns]\n",
"Accused_Current_Unit 21080 non-null float64\n",
"Accused_Current_Rank 20965 non-null object\n",
"Accused_Star 19882 non-null float64\n",
"Accused_Complaint_Category 19346 non-null object\n",
"Accused_Finding 7390 non-null object\n",
"Accused_Recommended_Discipline 17398 non-null float64\n",
"Accused_Final_Finding 6923 non-null object\n",
"Accused_Discipline 16912 non-null float64\n",
"PO_Witness_Name 3432 non-null object\n",
"PO_Witness_Gender 3432 non-null object\n",
"PO_Witness_Race 3432 non-null object\n",
"PO_Witness_Star 3220 non-null object\n",
"PO_Witness_Birth_Year 3415 non-null float64\n",
"PO_Witness_Date_Appointed 3415 non-null datetime64[ns]\n",
"Victim_Gender 12554 non-null object\n",
"Victim_Age 10041 non-null float64\n",
"Victim_Race_Desc 12554 non-null object\n",
"Complainant_Gender 22595 non-null object\n",
"Complainant_Age 19068 non-null float64\n",
"Complainant_Race_Desc 22595 non-null object\n",
"dtypes: datetime64[ns](5), float64(7), object(28)\n",
"memory usage: 9.9+ MB\n"
]
}
],
"source": [
"merged.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Saving everything"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1.to_csv('./Clean/June2016/report1_all_complaints.csv')\n",
"report2.to_csv('./Clean/June2016/report2_accused.csv')\n",
"report3.to_csv('./Clean/June2016/report3_po_witness.csv')\n",
"report4.to_csv('./Clean/June2016/report4_victim.csv')\n",
"report5.to_csv('./Clean/June2016/report5_complainant.csv')\n",
"merged.to_csv('./Clean/June2016/june2016_all.csv')\n",
"\n",
"report1.to_excel('./Clean/June2016/report1_all_complaints.xlsx')\n",
"report2.to_excel('./Clean/June2016/report2_accused.xlsx')\n",
"report3.to_excel('./Clean/June2016/report3_po_witness.xlsx')\n",
"report4.to_excel('./Clean/June2016/report4_victim.xlsx')\n",
"report5.to_excel('./Clean/June2016/report5_complainant.xlsx')\n",
"merged.to_excel('./Clean/June2016/june2016_all.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment