Skip to content

Instantly share code, notes, and snippets.

@DGalt
Created July 7, 2016 03:06
Show Gist options
  • Save DGalt/6e0fdee3edf75b1509c09e9ae9c0e5d6 to your computer and use it in GitHub Desktop.
Save DGalt/6e0fdee3edf75b1509c09e9ae9c0e5d6 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from glob import glob\n",
"import csv\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['./Raw/raw dump June 2016/P046957 - report 1 - all complaints in time frame.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 2 - identified accused xi.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 3 - police officer witness data xi.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 4 - victim data.xlsx',\n",
" './Raw/raw dump June 2016/P046957 - report 5 - complainant (reporting party) data.xlsx']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = sorted(glob('./Raw/raw dump June 2016/*.xlsx'))\n",
"files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 1 - all complaints in time frame.xlsx\n",
"\n",
"** Questions **\n",
"- For some of the Investigator rows there is a datetime in Unnamed: 6 - what is this?\n",
"- What is the column of values labeled as \"Unnamed: 5\" for the Investigator"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_excel(files[0], skiprows=9)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"df.dropna(how='all', inplace=True)\n",
"df = df[df['Location Code:'] != 'end of record'].copy()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['Number:'].fillna(method='ffill', inplace=True)\n",
"df['Number:'] = df['Number:'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df = df[df['Beat:'] == 'Investigator with Current Assignment and Rank:'].copy()\n",
"df = df[df['Beat:'] != 'Investigator with Current Assignment and Rank:'].copy()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 17045 entries, 0 to 50111\n",
"Data columns (total 13 columns):\n",
"Number: 17045 non-null int64\n",
"Beat: 16366 non-null object\n",
"Location Code: 16529 non-null object\n",
"Address of Incident: 12628 non-null object\n",
"Unnamed: 4 12877 non-null object\n",
"Unnamed: 5 824 non-null object\n",
"Unnamed: 6 13615 non-null object\n",
"Incident Date & Time 16531 non-null datetime64[ns]\n",
"Complaint Date 16531 non-null datetime64[ns]\n",
"Closed Date 14753 non-null datetime64[ns]\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: datetime64[ns](3), float64(3), int64(1), object(6)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"df = df.replace('----', float('nan')).replace('-----', float('nan'))\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.drop(['Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Beat:</th>\n",
" <th>Location Code:</th>\n",
" <th>Address of Incident:</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Unnamed: 5</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Incident Date &amp; Time</th>\n",
" <th>Complaint Date</th>\n",
" <th>Closed Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>106213</td>\n",
" <td>1631</td>\n",
" <td>17</td>\n",
" <td>3700</td>\n",
" <td>N HARLEM AVE</td>\n",
" <td>NaN</td>\n",
" <td>CHICAGO IL 60634</td>\n",
" <td>2015-07-19 21:00:00</td>\n",
" <td>2015-07-20</td>\n",
" <td>2015-09-21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Beat: Location Code: Address of Incident: Unnamed: 4 Unnamed: 5 \\\n",
"0 106213 1631 17 3700 N HARLEM AVE NaN \n",
"\n",
" Unnamed: 6 Incident Date & Time Complaint Date Closed Date \n",
"0 CHICAGO IL 60634 2015-07-19 21:00:00 2015-07-20 2015-09-21 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Beat</th>\n",
" <th>Location_Code</th>\n",
" <th>Address</th>\n",
" <th>Street</th>\n",
" <th>Apartment</th>\n",
" <th>City_State_Zipcode</th>\n",
" <th>Incident_Datetime</th>\n",
" <th>Complaint_Date</th>\n",
" <th>Closed_Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>106213</td>\n",
" <td>1631</td>\n",
" <td>17</td>\n",
" <td>3700</td>\n",
" <td>N HARLEM AVE</td>\n",
" <td>NaN</td>\n",
" <td>CHICAGO IL 60634</td>\n",
" <td>2015-07-19 21:00:00</td>\n",
" <td>2015-07-20</td>\n",
" <td>2015-09-21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Beat Location_Code Address Street Apartment \\\n",
"0 106213 1631 17 3700 N HARLEM AVE NaN \n",
"\n",
" City_State_Zipcode Incident_Datetime Complaint_Date Closed_Date \n",
"0 CHICAGO IL 60634 2015-07-19 21:00:00 2015-07-20 2015-09-21 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns = ['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street', 'Apartment', \n",
" 'City_State_Zipcode', 'Incident_Datetime', 'Complaint_Date', 'Closed_Date']\n",
"df.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 16404 entries, 1 to 50085\n",
"Data columns (total 13 columns):\n",
"Number: 16404 non-null int64\n",
"Beat: 16404 non-null object\n",
"Location Code: 16278 non-null object\n",
"Address of Incident: 16278 non-null object\n",
"Unnamed: 4 16278 non-null object\n",
"Unnamed: 5 11228 non-null object\n",
"Unnamed: 6 15714 non-null object\n",
"Incident Date & Time 0 non-null datetime64[ns]\n",
"Complaint Date 0 non-null datetime64[ns]\n",
"Closed Date 0 non-null datetime64[ns]\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: datetime64[ns](3), float64(3), int64(1), object(6)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"invest_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df.drop(['Beat:', 'Incident Date & Time', 'Closed Date', 'Complaint Date', \n",
" 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Location Code:</th>\n",
" <th>Address of Incident:</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Unnamed: 5</th>\n",
" <th>Unnamed: 6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>106213</td>\n",
" <td>KLIMAS, ROBERT</td>\n",
" <td>121</td>\n",
" <td>COMMANDER</td>\n",
" <td>0</td>\n",
" <td>2008-08-04 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1038595</td>\n",
" <td>DAUN, SHERRY</td>\n",
" <td>113</td>\n",
" <td>SUPERVISING INV IPRA</td>\n",
" <td>NaN</td>\n",
" <td>2008-12-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1039179</td>\n",
" <td>JONES, VINCENT</td>\n",
" <td>113</td>\n",
" <td>INVESTIGATOR 2 IPRA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1053492</td>\n",
" <td>MC GUIRE, TERRENCE</td>\n",
" <td>003</td>\n",
" <td>LIEUTENANT OF POLICE</td>\n",
" <td>376</td>\n",
" <td>1991-11-18 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1053493</td>\n",
" <td>BREIMON, GERALD</td>\n",
" <td>014</td>\n",
" <td>SERGEANT OF POLICE</td>\n",
" <td>869</td>\n",
" <td>1993-11-22 00:00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Location Code: Address of Incident: Unnamed: 4 \\\n",
"1 106213 KLIMAS, ROBERT 121 COMMANDER \n",
"16 1038595 DAUN, SHERRY 113 SUPERVISING INV IPRA \n",
"19 1039179 JONES, VINCENT 113 INVESTIGATOR 2 IPRA \n",
"22 1053492 MC GUIRE, TERRENCE 003 LIEUTENANT OF POLICE \n",
"25 1053493 BREIMON, GERALD 014 SERGEANT OF POLICE \n",
"\n",
" Unnamed: 5 Unnamed: 6 \n",
"1 0 2008-08-04 00:00:00 \n",
"16 NaN 2008-12-01 00:00:00 \n",
"19 NaN NaN \n",
"22 376 1991-11-18 00:00:00 \n",
"25 869 1993-11-22 00:00:00 "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"invest_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Unnamed: 5 and 6 **\n",
"\n",
"Looking at the full list of CPD officers it seems that Unnamed: 5 correspond to the officer's star number, while 6 corresponds to his/her appointment date. I wanted to confirm this, though, so I went ahead and looked at the subset of investigator names that appear in the full list of CPD officers"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Both 1043\n",
"One 115\n",
"Neither 46\n",
"dtype: int64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sworn = pd.read_csv('./all_sworn_officer.csv')\n",
"sworn_names = (sworn.LAST_NME + ', ' + sworn.FIRST_NME).tolist()\n",
"mask = invest_df['Location Code:'].drop_duplicates().apply(lambda x: x in sworn_names)\n",
"matches = invest_df.drop_duplicates(subset='Location Code:')[mask]\n",
"\n",
"#the year value in the APPOINTED_DATE column is ambiguous, so it needs to be fixed\n",
"def fix_time(x):\n",
" split = x.split('-')\n",
" if split[-1][0] == '0' or split[-1][0] == '1':\n",
" split[-1] = '20' + split[-1]\n",
" else:\n",
" split[-1] = '19' + split[-1]\n",
" return '-'.join(split)\n",
"\n",
"sworn.APPOINTED_DATE = pd.to_datetime(sworn.APPOINTED_DATE.astype(str).apply(fix_time), \n",
" errors='coerce')\n",
"matches['Unnamed: 6'] = matches['Unnamed: 6'].dt.normalize()\n",
"\n",
"def which_matches(x):\n",
" lname = x['Location Code:'].values[0].split(', ')[0]\n",
" fname = x['Location Code:'].values[0].split(', ')[1]\n",
" \n",
" sub_sworn = sworn[(sworn.LAST_NME==lname)&(sworn.FIRST_NME==fname)]\n",
" \n",
" star = x['Unnamed: 5'].values[0]\n",
" date = x['Unnamed: 6'].values[0]\n",
" star_matches = False\n",
" date_matches = False\n",
" sworn_stars = ['STAR%s'%num for num in range(1, 11)]\n",
" if star in sub_sworn[sworn_stars].dropna(axis=1).values:\n",
" star_matches = True \n",
" if date == sub_sworn.APPOINTED_DATE.values[0]:\n",
" date_matches = True\n",
" \n",
" if star_matches and date_matches:\n",
" return 'Both'\n",
" elif star_matches or date_matches:\n",
" return \"One\"\n",
" else:\n",
" return \"Neither\"\n",
" \n",
"matches.dropna().groupby(level=0).apply(which_matches).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"invest_df.columns = ['Complaint_Number', 'Investigator_Name', \n",
" 'Investigator_Current_Assignment', 'Investigator_Rank', \n",
" 'Investigator_Star', 'Investigator_Appointed_Date']"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1 = pd.merge(df, invest_df, on='Complaint_Number', how='outer')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### P046957 - report 2 - identified accused xi.xlsx\n",
"\n",
"** Questions **\n",
"- Does \"Unnamed: 11\" and \"Unnamed: 13\" correspond to \"Recommended Discipline\" and \"Discipline\" (columns to the left of those)?"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_excel(files[1], skiprows=8)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>107901.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>108026.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: \\\n",
"0 107901.0 NaN NaN NaN NaN \n",
"1 NaN GONZALES, ROBIN 1981 F S \n",
"2 NaN NaN end of record NaN NaN \n",
"3 108026.0 NaN NaN NaN NaN \n",
"4 NaN BECKER, JOHN 1970 M WHI \n",
"\n",
" Date of Appt: Current Unit: Current Rank: Star: Complaint Category \\\n",
"0 NaT NaN NaN NaN NaN \n",
"1 2008-04-28 18.0 PO 5137.0 NaN \n",
"2 NaT NaN NaN NaN NaN \n",
"3 NaT NaN NaN NaN NaN \n",
"4 2000-01-24 17.0 PO 4734.0 NaN \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Unnamed: 13 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 38571 entries, 0 to 38570\n",
"Data columns (total 14 columns):\n",
"Number: 10086 non-null float64\n",
"Accused: 18399 non-null object\n",
"Birth Yr: 28485 non-null object\n",
"Gender: 18399 non-null object\n",
"Race Code: 18399 non-null object\n",
" Date of Appt: 18399 non-null datetime64[ns]\n",
"Current Unit: 18399 non-null float64\n",
"Current Rank: 18297 non-null object\n",
"Star: 17371 non-null float64\n",
"Complaint Category 16945 non-null object\n",
"Finding &\n",
"Recommended Discipline 6295 non-null object\n",
"Unnamed: 11 15234 non-null float64\n",
"Final Finding & \n",
"Discipline 5892 non-null object\n",
"Unnamed: 13 14818 non-null float64\n",
"dtypes: datetime64[ns](1), float64(5), object(8)\n",
"memory usage: 4.1+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>107901</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>107901</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>108026</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: Date of Appt: \\\n",
"0 107901 NaN NaN NaN NaN NaT \n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"2 107901 NaN end of record NaN NaN NaT \n",
"3 108026 NaN NaN NaN NaN NaT \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"\n",
" Current Unit: Current Rank: Star: Complaint Category \\\n",
"0 NaN NaN NaN NaN \n",
"1 18.0 PO 5137.0 NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 17.0 PO 4734.0 NaN \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Unnamed: 13 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number:'].fillna(method='ffill', inplace=True)\n",
"df['Number:'] = df['Number:'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number:</th>\n",
" <th>Accused:</th>\n",
" <th>Birth Yr:</th>\n",
" <th>Gender:</th>\n",
" <th>Race Code:</th>\n",
" <th>Date of Appt:</th>\n",
" <th>Current Unit:</th>\n",
" <th>Current Rank:</th>\n",
" <th>Star:</th>\n",
" <th>Complaint Category</th>\n",
" <th>Finding &amp;\n",
"Recommended Discipline</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Final Finding &amp; \n",
"Discipline</th>\n",
" <th>Unnamed: 13</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>108026</td>\n",
" <td>MARKHAM, SEAN</td>\n",
" <td>1971</td>\n",
" <td>M</td>\n",
" <td>I</td>\n",
" <td>2000-06-19</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>19054.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1038595</td>\n",
" <td>WELLS, OTIS</td>\n",
" <td>1967</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-04-02</td>\n",
" <td>6.0</td>\n",
" <td>PO</td>\n",
" <td>5385.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1038595</td>\n",
" <td>MCCLAY, CHARLES</td>\n",
" <td>1983</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-10-29</td>\n",
" <td>3.0</td>\n",
" <td>PO</td>\n",
" <td>4735.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number: Accused: Birth Yr: Gender: Race Code: Date of Appt: \\\n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"5 108026 MARKHAM, SEAN 1971 M I 2000-06-19 \n",
"8 1038595 WELLS, OTIS 1967 M BLK 2007-04-02 \n",
"9 1038595 MCCLAY, CHARLES 1983 M BLK 2007-10-29 \n",
"\n",
" Current Unit: Current Rank: Star: Complaint Category \\\n",
"1 18.0 PO 5137.0 NaN \n",
"4 17.0 PO 4734.0 NaN \n",
"5 17.0 PO 19054.0 NaN \n",
"8 6.0 PO 5385.0 05D-NO ARREST \n",
"9 3.0 PO 4735.0 05D-NO ARREST \n",
"\n",
" Finding &\\nRecommended Discipline Unnamed: 11 Final Finding & \\nDiscipline \\\n",
"1 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"5 NaN NaN NaN \n",
"8 EX 600.0 EX \n",
"9 EX 600.0 EX \n",
"\n",
" Unnamed: 13 \n",
"1 NaN \n",
"4 NaN \n",
"5 NaN \n",
"8 600.0 \n",
"9 600.0 "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df = df[df['Birth Yr:'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Name</th>\n",
" <th>Birth_Yr</th>\n",
" <th>Gender</th>\n",
" <th>Race_Code</th>\n",
" <th>Date_of_Appt</th>\n",
" <th>Current_Unit</th>\n",
" <th>Current_Rank</th>\n",
" <th>Star</th>\n",
" <th>Complaint_Category</th>\n",
" <th>Finding</th>\n",
" <th>Recommended_Discipline</th>\n",
" <th>Final_Finding</th>\n",
" <th>Discipline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>107901</td>\n",
" <td>GONZALES, ROBIN</td>\n",
" <td>1981</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>2008-04-28</td>\n",
" <td>18.0</td>\n",
" <td>PO</td>\n",
" <td>5137.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>108026</td>\n",
" <td>BECKER, JOHN</td>\n",
" <td>1970</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2000-01-24</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>4734.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>108026</td>\n",
" <td>MARKHAM, SEAN</td>\n",
" <td>1971</td>\n",
" <td>M</td>\n",
" <td>I</td>\n",
" <td>2000-06-19</td>\n",
" <td>17.0</td>\n",
" <td>PO</td>\n",
" <td>19054.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1038595</td>\n",
" <td>WELLS, OTIS</td>\n",
" <td>1967</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-04-02</td>\n",
" <td>6.0</td>\n",
" <td>PO</td>\n",
" <td>5385.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1038595</td>\n",
" <td>MCCLAY, CHARLES</td>\n",
" <td>1983</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2007-10-29</td>\n",
" <td>3.0</td>\n",
" <td>PO</td>\n",
" <td>4735.0</td>\n",
" <td>05D-NO ARREST</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" <td>EX</td>\n",
" <td>600.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Name Birth_Yr Gender Race_Code Date_of_Appt \\\n",
"1 107901 GONZALES, ROBIN 1981 F S 2008-04-28 \n",
"4 108026 BECKER, JOHN 1970 M WHI 2000-01-24 \n",
"5 108026 MARKHAM, SEAN 1971 M I 2000-06-19 \n",
"8 1038595 WELLS, OTIS 1967 M BLK 2007-04-02 \n",
"9 1038595 MCCLAY, CHARLES 1983 M BLK 2007-10-29 \n",
"\n",
" Current_Unit Current_Rank Star Complaint_Category Finding \\\n",
"1 18.0 PO 5137.0 NaN NaN \n",
"4 17.0 PO 4734.0 NaN NaN \n",
"5 17.0 PO 19054.0 NaN NaN \n",
"8 6.0 PO 5385.0 05D-NO ARREST EX \n",
"9 3.0 PO 4735.0 05D-NO ARREST EX \n",
"\n",
" Recommended_Discipline Final_Finding Discipline \n",
"1 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"5 NaN NaN NaN \n",
"8 600.0 EX 600.0 \n",
"9 600.0 EX 600.0 "
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns = [col.strip().replace(':', '').replace(' ', '_') for col in df.columns]\n",
"df.rename(columns={'Number': 'Complaint_Number',\n",
" 'Accused': 'Name',\n",
" 'Finding_&\\nRecommended_Discipline': 'Finding',\n",
" 'Unnamed_11': 'Recommended_Discipline',\n",
" 'Final_Finding_&_\\nDiscipline': 'Final_Finding',\n",
" 'Unnamed_13': 'Discipline'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report2 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 3 - police officer witness data xi.xlsx"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Number:</td>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Number:</td>\n",
" <td>1053509</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>HEINICHEN, WALTER</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>410</td>\n",
" <td>1968.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>LAMEKA, MARGARET</td>\n",
" <td>F</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1958.0</td>\n",
" <td>1986-06-16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Number:</td>\n",
" <td>1053545</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>SCALES, MONICA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>15769</td>\n",
" <td>1968.0</td>\n",
" <td>2006-09-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Number:</td>\n",
" <td>1053546</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>JANKOWSKI, JASON</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>19010</td>\n",
" <td>1967.0</td>\n",
" <td>1995-02-06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Number:</td>\n",
" <td>1053584</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>ADAMS, HELEN</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>NaN</td>\n",
" <td>1960.0</td>\n",
" <td>1990-08-27</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>BROWNRIDGE, JOHN</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>2874</td>\n",
" <td>1957.0</td>\n",
" <td>1986-06-16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>FULTON, EDWARD</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>12724</td>\n",
" <td>1965.0</td>\n",
" <td>1990-03-26</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>KIDD, THOMAS</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>17196</td>\n",
" <td>1966.0</td>\n",
" <td>1998-06-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Number:</td>\n",
" <td>1053591</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>ANTONIAZZI, ALBERT</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1952.0</td>\n",
" <td>1987-12-07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Number:</td>\n",
" <td>1053637</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>CAVANAUGH, RONALD</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>3274</td>\n",
" <td>1970.0</td>\n",
" <td>1996-03-18</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>STOYIAS, THOMAS</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>8893</td>\n",
" <td>1980.0</td>\n",
" <td>2002-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Number:</td>\n",
" <td>1053644</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>OMMUNDSON, STEVEN</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>13862</td>\n",
" <td>1984.0</td>\n",
" <td>2007-07-09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6344</th>\n",
" <td>Number:</td>\n",
" <td>1080023</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6345</th>\n",
" <td>CASTILLO, VERONICA</td>\n",
" <td>F</td>\n",
" <td>S</td>\n",
" <td>17807</td>\n",
" <td>1965.0</td>\n",
" <td>2005-01-03</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6346</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6347</th>\n",
" <td>Number:</td>\n",
" <td>1080030</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6348</th>\n",
" <td>BANKS, CHRISTOPH</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>17375</td>\n",
" <td>1972.0</td>\n",
" <td>1998-05-26</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6349</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6350</th>\n",
" <td>Number:</td>\n",
" <td>1080089</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6351</th>\n",
" <td>WHITE, JOSEPH</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>2058</td>\n",
" <td>1958.0</td>\n",
" <td>2003-08-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6352</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6353</th>\n",
" <td>Number:</td>\n",
" <td>1080111</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6354</th>\n",
" <td>CLARKE, TASHA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>14871</td>\n",
" <td>1977.0</td>\n",
" <td>2006-05-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6355</th>\n",
" <td>RAEHL, LAWRENCE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>8754</td>\n",
" <td>1968.0</td>\n",
" <td>1994-07-05</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6356</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6357</th>\n",
" <td>Number:</td>\n",
" <td>1080185</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6358</th>\n",
" <td>LARSON, ROBERT</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>1119</td>\n",
" <td>1962.0</td>\n",
" <td>1992-12-07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6359</th>\n",
" <td>MEDICI, MICHAEL</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>13023</td>\n",
" <td>1975.0</td>\n",
" <td>1999-05-10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6360</th>\n",
" <td>QUINTERO, ROBERT</td>\n",
" <td>M</td>\n",
" <td>S</td>\n",
" <td>17017</td>\n",
" <td>1970.0</td>\n",
" <td>1991-12-16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6361</th>\n",
" <td>SCHUR, ADAM</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>19163</td>\n",
" <td>1982.0</td>\n",
" <td>2008-04-28</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6362</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6363</th>\n",
" <td>Number:</td>\n",
" <td>1080192</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6364</th>\n",
" <td>ESTRADA, FRED</td>\n",
" <td>M</td>\n",
" <td>S</td>\n",
" <td>4846</td>\n",
" <td>1971.0</td>\n",
" <td>2006-12-18</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6365</th>\n",
" <td>MARKHAM, KEVIN</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>9284</td>\n",
" <td>1970.0</td>\n",
" <td>1995-09-05</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6366</th>\n",
" <td>MUNOZ JR, CESAR</td>\n",
" <td>M</td>\n",
" <td>S</td>\n",
" <td>7013</td>\n",
" <td>1976.0</td>\n",
" <td>2006-10-30</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6367</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6368</th>\n",
" <td>Number:</td>\n",
" <td>1080497</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6369</th>\n",
" <td>POCIUS, NICHOLAS</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11106</td>\n",
" <td>1982.0</td>\n",
" <td>2006-07-31</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6370</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6371</th>\n",
" <td>Number:</td>\n",
" <td>1080568</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6372</th>\n",
" <td>DOTSON, ARTREUNA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6373</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6374 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Gender Race Star Birth Year \\\n",
"0 Number: 1053502 NaN NaN NaN \n",
"1 HARRIS, KAL M BLK 14236 1974.0 \n",
"2 NaN NaN end of record NaN NaN \n",
"3 Number: 1053509 NaN NaN NaN \n",
"4 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"5 HEINICHEN, WALTER M WHI 410 1968.0 \n",
"6 LAMEKA, MARGARET F WHI NaN 1958.0 \n",
"7 NaN NaN end of record NaN NaN \n",
"8 Number: 1053545 NaN NaN NaN \n",
"9 SCALES, MONICA F BLK 15769 1968.0 \n",
"10 NaN NaN end of record NaN NaN \n",
"11 Number: 1053546 NaN NaN NaN \n",
"12 JANKOWSKI, JASON M WHI 19010 1967.0 \n",
"13 NaN NaN end of record NaN NaN \n",
"14 Number: 1053584 NaN NaN NaN \n",
"15 ADAMS, HELEN F BLK NaN 1960.0 \n",
"16 BROWNRIDGE, JOHN M BLK 2874 1957.0 \n",
"17 FULTON, EDWARD M BLK 12724 1965.0 \n",
"18 KIDD, THOMAS M BLK 17196 1966.0 \n",
"19 NaN NaN end of record NaN NaN \n",
"20 Number: 1053591 NaN NaN NaN \n",
"21 ANTONIAZZI, ALBERT M WHI NaN 1952.0 \n",
"22 NaN NaN end of record NaN NaN \n",
"23 Number: 1053637 NaN NaN NaN \n",
"24 CAVANAUGH, RONALD M WHI 3274 1970.0 \n",
"25 STOYIAS, THOMAS M WHI 8893 1980.0 \n",
"26 NaN NaN end of record NaN NaN \n",
"27 Number: 1053644 NaN NaN NaN \n",
"28 OMMUNDSON, STEVEN M WHI 13862 1984.0 \n",
"29 NaN NaN end of record NaN NaN \n",
"... ... ... ... ... ... \n",
"6344 Number: 1080023 NaN NaN NaN \n",
"6345 CASTILLO, VERONICA F S 17807 1965.0 \n",
"6346 NaN NaN end of record NaN NaN \n",
"6347 Number: 1080030 NaN NaN NaN \n",
"6348 BANKS, CHRISTOPH M WHI 17375 1972.0 \n",
"6349 NaN NaN end of record NaN NaN \n",
"6350 Number: 1080089 NaN NaN NaN \n",
"6351 WHITE, JOSEPH M WHI 2058 1958.0 \n",
"6352 NaN NaN end of record NaN NaN \n",
"6353 Number: 1080111 NaN NaN NaN \n",
"6354 CLARKE, TASHA F BLK 14871 1977.0 \n",
"6355 RAEHL, LAWRENCE M WHI 8754 1968.0 \n",
"6356 NaN NaN end of record NaN NaN \n",
"6357 Number: 1080185 NaN NaN NaN \n",
"6358 LARSON, ROBERT M WHI 1119 1962.0 \n",
"6359 MEDICI, MICHAEL M WHI 13023 1975.0 \n",
"6360 QUINTERO, ROBERT M S 17017 1970.0 \n",
"6361 SCHUR, ADAM M WHI 19163 1982.0 \n",
"6362 NaN NaN end of record NaN NaN \n",
"6363 Number: 1080192 NaN NaN NaN \n",
"6364 ESTRADA, FRED M S 4846 1971.0 \n",
"6365 MARKHAM, KEVIN M WHI 9284 1970.0 \n",
"6366 MUNOZ JR, CESAR M S 7013 1976.0 \n",
"6367 NaN NaN end of record NaN NaN \n",
"6368 Number: 1080497 NaN NaN NaN \n",
"6369 POCIUS, NICHOLAS M WHI 11106 1982.0 \n",
"6370 NaN NaN end of record NaN NaN \n",
"6371 Number: 1080568 NaN NaN NaN \n",
"6372 DOTSON, ARTREUNA F BLK NaN NaN \n",
"6373 NaN NaN end of record NaN NaN \n",
"\n",
" Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 \\\n",
"0 NaT NaN NaN NaN NaN \n",
"1 2001-05-29 NaN NaN NaN NaN \n",
"2 NaT NaN NaN NaN NaN \n",
"3 NaT NaN NaN NaN NaN \n",
"4 1996-12-02 NaN NaN NaN NaN \n",
"5 1996-12-02 NaN NaN NaN NaN \n",
"6 1986-06-16 NaN NaN NaN NaN \n",
"7 NaT NaN NaN NaN NaN \n",
"8 NaT NaN NaN NaN NaN \n",
"9 2006-09-25 NaN NaN NaN NaN \n",
"10 NaT NaN NaN NaN NaN \n",
"11 NaT NaN NaN NaN NaN \n",
"12 1995-02-06 NaN NaN NaN NaN \n",
"13 NaT NaN NaN NaN NaN \n",
"14 NaT NaN NaN NaN NaN \n",
"15 1990-08-27 NaN NaN NaN NaN \n",
"16 1986-06-16 NaN NaN NaN NaN \n",
"17 1990-03-26 NaN NaN NaN NaN \n",
"18 1998-06-29 NaN NaN NaN NaN \n",
"19 NaT NaN NaN NaN NaN \n",
"20 NaT NaN NaN NaN NaN \n",
"21 1987-12-07 NaN NaN NaN NaN \n",
"22 NaT NaN NaN NaN NaN \n",
"23 NaT NaN NaN NaN NaN \n",
"24 1996-03-18 NaN NaN NaN NaN \n",
"25 2002-12-02 NaN NaN NaN NaN \n",
"26 NaT NaN NaN NaN NaN \n",
"27 NaT NaN NaN NaN NaN \n",
"28 2007-07-09 NaN NaN NaN NaN \n",
"29 NaT NaN NaN NaN NaN \n",
"... ... ... ... ... ... \n",
"6344 NaT NaN NaN NaN NaN \n",
"6345 2005-01-03 NaN NaN NaN NaN \n",
"6346 NaT NaN NaN NaN NaN \n",
"6347 NaT NaN NaN NaN NaN \n",
"6348 1998-05-26 NaN NaN NaN NaN \n",
"6349 NaT NaN NaN NaN NaN \n",
"6350 NaT NaN NaN NaN NaN \n",
"6351 2003-08-25 NaN NaN NaN NaN \n",
"6352 NaT NaN NaN NaN NaN \n",
"6353 NaT NaN NaN NaN NaN \n",
"6354 2006-05-01 NaN NaN NaN NaN \n",
"6355 1994-07-05 NaN NaN NaN NaN \n",
"6356 NaT NaN NaN NaN NaN \n",
"6357 NaT NaN NaN NaN NaN \n",
"6358 1992-12-07 NaN NaN NaN NaN \n",
"6359 1999-05-10 NaN NaN NaN NaN \n",
"6360 1991-12-16 NaN NaN NaN NaN \n",
"6361 2008-04-28 NaN NaN NaN NaN \n",
"6362 NaT NaN NaN NaN NaN \n",
"6363 NaT NaN NaN NaN NaN \n",
"6364 2006-12-18 NaN NaN NaN NaN \n",
"6365 1995-09-05 NaN NaN NaN NaN \n",
"6366 2006-10-30 NaN NaN NaN NaN \n",
"6367 NaT NaN NaN NaN NaN \n",
"6368 NaT NaN NaN NaN NaN \n",
"6369 2006-07-31 NaN NaN NaN NaN \n",
"6370 NaT NaN NaN NaN NaN \n",
"6371 NaT NaN NaN NaN NaN \n",
"6372 NaT NaN NaN NaN NaN \n",
"6373 NaT NaN NaN NaN NaN \n",
"\n",
" Unnamed: 10 Unnamed: 11 \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"5 NaN NaN \n",
"6 NaN NaN \n",
"7 NaN NaN \n",
"8 NaN NaN \n",
"9 NaN NaN \n",
"10 NaN NaN \n",
"11 NaN NaN \n",
"12 NaN NaN \n",
"13 NaN NaN \n",
"14 NaN NaN \n",
"15 NaN NaN \n",
"16 NaN NaN \n",
"17 NaN NaN \n",
"18 NaN NaN \n",
"19 NaN NaN \n",
"20 NaN NaN \n",
"21 NaN NaN \n",
"22 NaN NaN \n",
"23 NaN NaN \n",
"24 NaN NaN \n",
"25 NaN NaN \n",
"26 NaN NaN \n",
"27 NaN NaN \n",
"28 NaN NaN \n",
"29 NaN NaN \n",
"... ... ... \n",
"6344 NaN NaN \n",
"6345 NaN NaN \n",
"6346 NaN NaN \n",
"6347 NaN NaN \n",
"6348 NaN NaN \n",
"6349 NaN NaN \n",
"6350 NaN NaN \n",
"6351 NaN NaN \n",
"6352 NaN NaN \n",
"6353 NaN NaN \n",
"6354 NaN NaN \n",
"6355 NaN NaN \n",
"6356 NaN NaN \n",
"6357 NaN NaN \n",
"6358 NaN NaN \n",
"6359 NaN NaN \n",
"6360 NaN NaN \n",
"6361 NaN NaN \n",
"6362 NaN NaN \n",
"6363 NaN NaN \n",
"6364 NaN NaN \n",
"6365 NaN NaN \n",
"6366 NaN NaN \n",
"6367 NaN NaN \n",
"6368 NaN NaN \n",
"6369 NaN NaN \n",
"6370 NaN NaN \n",
"6371 NaN NaN \n",
"6372 NaN NaN \n",
"6373 NaN NaN \n",
"\n",
"[6374 rows x 12 columns]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[2], skiprows=9)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['Complaint_Number'] = df[df['Unnamed: 0']=='Number:']['Gender']"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1053502</td>\n",
" <td>Number:</td>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1053509</td>\n",
" <td>Number:</td>\n",
" <td>1053509</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Unnamed: 0 Gender Race Star \\\n",
"0 1053502 Number: 1053502 NaN NaN \n",
"1 NaN HARRIS, KAL M BLK 14236 \n",
"2 NaN NaN NaN end of record NaN \n",
"3 1053509 Number: 1053509 NaN NaN \n",
"4 NaN BUKOWSKIBUS, GEORGE M WHI 11982 \n",
"\n",
" Birth Year Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 \\\n",
"0 NaN NaT NaN NaN NaN NaN \n",
"1 1974.0 2001-05-29 NaN NaN NaN NaN \n",
"2 NaN NaT NaN NaN NaN NaN \n",
"3 NaN NaT NaN NaN NaN NaN \n",
"4 1966.0 1996-12-02 NaN NaN NaN NaN \n",
"\n",
" Unnamed: 10 Unnamed: 11 \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cols = ['Complaint_Number'] + [col for col in df.columns if col != 'Complaint_Number']\n",
"df = df[cols].copy()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1053502</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053509</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053509</td>\n",
" <td>HEINICHEN, WALTER</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>410</td>\n",
" <td>1968.0</td>\n",
" <td>1996-12-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053509</td>\n",
" <td>LAMEKA, MARGARET</td>\n",
" <td>F</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1958.0</td>\n",
" <td>1986-06-16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053545</td>\n",
" <td>SCALES, MONICA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>15769</td>\n",
" <td>1968.0</td>\n",
" <td>2006-09-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Unnamed: 0 Gender Race Star Birth Year \\\n",
"1 1053502 HARRIS, KAL M BLK 14236 1974.0 \n",
"4 1053509 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"5 1053509 HEINICHEN, WALTER M WHI 410 1968.0 \n",
"6 1053509 LAMEKA, MARGARET F WHI NaN 1958.0 \n",
"9 1053545 SCALES, MONICA F BLK 15769 1968.0 \n",
"\n",
" Date Appointed Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 \\\n",
"1 2001-05-29 NaN NaN NaN NaN NaN \n",
"4 1996-12-02 NaN NaN NaN NaN NaN \n",
"5 1996-12-02 NaN NaN NaN NaN NaN \n",
"6 1986-06-16 NaN NaN NaN NaN NaN \n",
"9 2006-09-25 NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 11 \n",
"1 NaN \n",
"4 NaN \n",
"5 NaN \n",
"6 NaN \n",
"9 NaN "
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.Complaint_Number.fillna(method='ffill', inplace=True)\n",
"df = df[(df['Unnamed: 0'] != 'Number:') & (df['Race'] != 'end of record')]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 3432 entries, 1 to 6372\n",
"Data columns (total 13 columns):\n",
"Complaint_Number 3432 non-null object\n",
"Unnamed: 0 3432 non-null object\n",
"Gender 3432 non-null object\n",
"Race 3432 non-null object\n",
"Star 3220 non-null object\n",
"Birth Year 3415 non-null float64\n",
"Date Appointed 3415 non-null datetime64[ns]\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"dtypes: datetime64[ns](1), float64(7), object(5)\n",
"memory usage: 375.4+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', \n",
" 'Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 3432 entries, 1 to 6372\n",
"Data columns (total 7 columns):\n",
"Complaint_Number 3432 non-null object\n",
"Unnamed: 0 3432 non-null object\n",
"Gender 3432 non-null object\n",
"Race 3432 non-null object\n",
"Star 3220 non-null object\n",
"Birth Year 3415 non-null float64\n",
"Date Appointed 3415 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](1), float64(1), object(5)\n",
"memory usage: 214.5+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Name</th>\n",
" <th>Gender</th>\n",
" <th>Race</th>\n",
" <th>Star</th>\n",
" <th>Birth Year</th>\n",
" <th>Date Appointed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1053502</td>\n",
" <td>HARRIS, KAL</td>\n",
" <td>M</td>\n",
" <td>BLK</td>\n",
" <td>14236</td>\n",
" <td>1974.0</td>\n",
" <td>2001-05-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053509</td>\n",
" <td>BUKOWSKIBUS, GEORGE</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>11982</td>\n",
" <td>1966.0</td>\n",
" <td>1996-12-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053509</td>\n",
" <td>HEINICHEN, WALTER</td>\n",
" <td>M</td>\n",
" <td>WHI</td>\n",
" <td>410</td>\n",
" <td>1968.0</td>\n",
" <td>1996-12-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053509</td>\n",
" <td>LAMEKA, MARGARET</td>\n",
" <td>F</td>\n",
" <td>WHI</td>\n",
" <td>NaN</td>\n",
" <td>1958.0</td>\n",
" <td>1986-06-16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053545</td>\n",
" <td>SCALES, MONICA</td>\n",
" <td>F</td>\n",
" <td>BLK</td>\n",
" <td>15769</td>\n",
" <td>1968.0</td>\n",
" <td>2006-09-25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Name Gender Race Star Birth Year \\\n",
"1 1053502 HARRIS, KAL M BLK 14236 1974.0 \n",
"4 1053509 BUKOWSKIBUS, GEORGE M WHI 11982 1966.0 \n",
"5 1053509 HEINICHEN, WALTER M WHI 410 1968.0 \n",
"6 1053509 LAMEKA, MARGARET F WHI NaN 1958.0 \n",
"9 1053545 SCALES, MONICA F BLK 15769 1968.0 \n",
"\n",
" Date Appointed \n",
"1 2001-05-29 \n",
"4 1996-12-02 \n",
"5 1996-12-02 \n",
"6 1986-06-16 \n",
"9 2006-09-25 "
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report3 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### P046957 - report 4 - victim data.xlsx"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Unnamed: 1</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Race Desc</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" <th>Unnamed: 12</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Unnamed: 1 Gender Age Unnamed: 4 Race Desc \\\n",
"0 1039179.0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN F 25.832877 NaN BLACK \n",
"2 NaN NaN M 38.000000 NaN BLACK \n",
"3 NaN NaN NaN NaN NaN end of record \n",
"4 1053502.0 NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \\\n",
"0 NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN \n",
"\n",
" Unnamed: 12 \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[3], skiprows=7)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19821 entries, 0 to 19820\n",
"Data columns (total 13 columns):\n",
"Number 6271 non-null float64\n",
"Unnamed: 1 0 non-null float64\n",
"Gender 7279 non-null object\n",
"Age 5889 non-null float64\n",
"Unnamed: 4 0 non-null float64\n",
"Race Desc 13550 non-null object\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"Unnamed: 12 0 non-null float64\n",
"dtypes: float64(11), object(2)\n",
"memory usage: 2.0+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1039179.0 NaN NaN NaN\n",
"1 NaN F 25.832877 BLACK\n",
"2 NaN M 38.000000 BLACK\n",
"3 NaN NaN NaN end of record\n",
"4 1053502.0 NaN NaN NaN"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(axis=1, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1039179 NaN NaN NaN\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"3 1039179 NaN NaN end of record\n",
"4 1053502 NaN NaN NaN"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number'].fillna(method='ffill', inplace=True)\n",
"df['Number'] = df['Number'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1053502</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"3 1039179 NaN NaN end of record\n",
"5 1053502 F NaN BLACK\n",
"6 1053502 NaN NaN end of record"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1053505</td>\n",
" <td>F</td>\n",
" <td>18.378082</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1053509</td>\n",
" <td>F</td>\n",
" <td>42.747945</td>\n",
" <td>WHITE HISPANIC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"5 1053502 F NaN BLACK\n",
"8 1053505 F 18.378082 BLACK\n",
"11 1053509 F 42.747945 WHITE HISPANIC"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[df['Race Desc'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race_Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>25.832877</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>38.000000</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1053502</td>\n",
" <td>F</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1053505</td>\n",
" <td>F</td>\n",
" <td>18.378082</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1053509</td>\n",
" <td>F</td>\n",
" <td>42.747945</td>\n",
" <td>WHITE HISPANIC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Gender Age Race_Desc\n",
"1 1039179 F 25.832877 BLACK\n",
"2 1039179 M 38.000000 BLACK\n",
"5 1053502 F NaN BLACK\n",
"8 1053505 F 18.378082 BLACK\n",
"11 1053509 F 42.747945 WHITE HISPANIC"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report4 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### P046957 - report 5 - complainant (reporting party) data"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Unnamed: 1</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Unnamed: 4</th>\n",
" <th>Race Desc</th>\n",
" <th>Unnamed: 6</th>\n",
" <th>Unnamed: 7</th>\n",
" <th>Unnamed: 8</th>\n",
" <th>Unnamed: 9</th>\n",
" <th>Unnamed: 10</th>\n",
" <th>Unnamed: 11</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1038595.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>NaN</td>\n",
" <td>WHITE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>NaN</td>\n",
" <td>BLACK</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Unnamed: 1 Gender Age Unnamed: 4 Race Desc \\\n",
"0 1038595.0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN F 63.805479 NaN WHITE \n",
"2 NaN NaN NaN NaN NaN end of record \n",
"3 1039179.0 NaN NaN NaN NaN NaN \n",
"4 NaN NaN M 50.449315 NaN BLACK \n",
"\n",
" Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \n",
"0 NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN "
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_excel(files[4], skiprows=7)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 39917 entries, 0 to 39916\n",
"Data columns (total 12 columns):\n",
"Number 13238 non-null float64\n",
"Unnamed: 1 0 non-null float64\n",
"Gender 13441 non-null object\n",
"Age 11675 non-null float64\n",
"Unnamed: 4 0 non-null float64\n",
"Race Desc 26679 non-null object\n",
"Unnamed: 6 0 non-null float64\n",
"Unnamed: 7 0 non-null float64\n",
"Unnamed: 8 0 non-null float64\n",
"Unnamed: 9 0 non-null float64\n",
"Unnamed: 10 0 non-null float64\n",
"Unnamed: 11 0 non-null float64\n",
"dtypes: float64(10), object(2)\n",
"memory usage: 3.7+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1038595.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>end of record</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1039179.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"0 1038595.0 NaN NaN NaN\n",
"1 NaN F 63.805479 WHITE\n",
"2 NaN NaN NaN end of record\n",
"3 1039179.0 NaN NaN NaN\n",
"4 NaN M 50.449315 BLACK"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(axis=1, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Number'].fillna(method='ffill', inplace=True)\n",
"df['Number'] = df['Number'].astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = df.columns.tolist()[1:]\n",
"df.dropna(subset=subset, how='all', inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Number Gender Age Race Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[df['Race Desc'] != 'end of record']\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Complaint_Number</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Race_Desc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1038595</td>\n",
" <td>F</td>\n",
" <td>63.805479</td>\n",
" <td>WHITE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>50.449315</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1039179</td>\n",
" <td>M</td>\n",
" <td>51.391781</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1039179</td>\n",
" <td>F</td>\n",
" <td>34.641096</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1053492</td>\n",
" <td>F</td>\n",
" <td>51.364384</td>\n",
" <td>BLACK</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Complaint_Number Gender Age Race_Desc\n",
"1 1038595 F 63.805479 WHITE\n",
"4 1039179 M 50.449315 BLACK\n",
"5 1039179 M 51.391781 BLACK\n",
"6 1039179 F 34.641096 BLACK\n",
"9 1053492 F 51.364384 BLACK"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"report5 = df.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merging datasets"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',\n",
" 'Apartment', 'City_State_Zipcode', 'Incident_Datetime',\n",
" 'Complaint_Date', 'Closed_Date', 'Investigator_Name',\n",
" 'Investigator_Current_Assignment', 'Investigator_Rank',\n",
" 'Investigator_Star', 'Investigator_Appointed_Date'],\n",
" dtype='object')"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report1.columns"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Name', 'Birth_Yr', 'Gender', 'Race_Code',\n",
" 'Date_of_Appt', 'Current_Unit', 'Current_Rank', 'Star',\n",
" 'Complaint_Category', 'Finding', 'Recommended_Discipline',\n",
" 'Final_Finding', 'Discipline'],\n",
" dtype='object')"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report2.columns"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Name', 'Gender', 'Race', 'Star', 'Birth Year',\n",
" 'Date Appointed'],\n",
" dtype='object')"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report3.columns"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report4.columns"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report5.columns"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1.rename(columns={'Investigator_Appointed_Date': 'Investigator_Date_Appointed'})\n",
"report2.rename(columns={'Date_of_Appt': 'Date_Appointed'})\n",
"\n",
"cols = ['Accused_' + col.strip().replace(' ', '_') for col in report2.columns[1:]]\n",
"report2.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['PO_Witness_' + col.strip().replace(' ', '_') for col in report3.columns[1:]]\n",
"report3.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['Victim_' + col.strip().replace(' ', '_') for col in report4.columns[1:]]\n",
"report4.columns = ['Complaint_Number'] + cols\n",
"\n",
"cols = ['Complainant_' + col.strip().replace(' ', '_') for col in report5.columns[1:]]\n",
"report5.columns = ['Complaint_Number'] + cols"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',\n",
" 'Apartment', 'City_State_Zipcode', 'Incident_Datetime',\n",
" 'Complaint_Date', 'Closed_Date', 'Investigator_Name',\n",
" 'Investigator_Current_Assignment', 'Investigator_Rank',\n",
" 'Investigator_Star', 'Investigator_Appointed_Date'],\n",
" dtype='object')"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report1.columns"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Accused_Accused_Name', 'Accused_Accused_Birth_Yr',\n",
" 'Accused_Accused_Gender', 'Accused_Accused_Race_Code',\n",
" 'Accused_Accused_Date_of_Appt', 'Accused_Accused_Current_Unit',\n",
" 'Accused_Accused_Current_Rank', 'Accused_Accused_Star',\n",
" 'Accused_Accused_Complaint_Category', 'Accused_Accused_Finding',\n",
" 'Accused_Accused_Recommended_Discipline',\n",
" 'Accused_Accused_Final_Finding', 'Accused_Accused_Discipline'],\n",
" dtype='object')"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report2.columns"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'PO_Witness_PO_Witness_Name',\n",
" 'PO_Witness_PO_Witness_Gender', 'PO_Witness_PO_Witness_Race',\n",
" 'PO_Witness_PO_Witness_Star', 'PO_Witness_PO_Witness_Birth_Year',\n",
" 'PO_Witness_PO_Witness_Date_Appointed'],\n",
" dtype='object')"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report3.columns"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Victim_Victim_Gender', 'Victim_Victim_Age',\n",
" 'Victim_Victim_Race_Desc'],\n",
" dtype='object')"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report4.columns"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Complaint_Number', 'Complainant_Gender', 'Complainant_Age',\n",
" 'Complainant_Race_Desc'],\n",
" dtype='object')"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"report5.columns"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"merged = pd.merge(report1, report2, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report3, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report4, on='Complaint_Number', how='outer')\n",
"merged = pd.merge(merged, report5, on='Complaint_Number', how='outer')"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 31597 entries, 0 to 31596\n",
"Data columns (total 40 columns):\n",
"Complaint_Number 31597 non-null object\n",
"Beat 26935 non-null object\n",
"Location_Code 27149 non-null object\n",
"Address 19684 non-null object\n",
"Street 20110 non-null object\n",
"Apartment 1530 non-null object\n",
"City_State_Zipcode 21396 non-null object\n",
"Incident_Datetime 27151 non-null datetime64[ns]\n",
"Complaint_Date 27151 non-null datetime64[ns]\n",
"Closed_Date 24203 non-null datetime64[ns]\n",
"Investigator_Name 27753 non-null object\n",
"Investigator_Current_Assignment 27753 non-null object\n",
"Investigator_Rank 27753 non-null object\n",
"Investigator_Star 19661 non-null object\n",
"Investigator_Appointed_Date 26836 non-null object\n",
"Accused_Accused_Name 21080 non-null object\n",
"Accused_Accused_Birth_Yr 21080 non-null object\n",
"Accused_Accused_Gender 21080 non-null object\n",
"Accused_Accused_Race_Code 21080 non-null object\n",
"Accused_Accused_Date_of_Appt 21080 non-null datetime64[ns]\n",
"Accused_Accused_Current_Unit 21080 non-null float64\n",
"Accused_Accused_Current_Rank 20965 non-null object\n",
"Accused_Accused_Star 19882 non-null float64\n",
"Accused_Accused_Complaint_Category 19346 non-null object\n",
"Accused_Accused_Finding 7390 non-null object\n",
"Accused_Accused_Recommended_Discipline 17398 non-null float64\n",
"Accused_Accused_Final_Finding 6923 non-null object\n",
"Accused_Accused_Discipline 16912 non-null float64\n",
"PO_Witness_PO_Witness_Name 3432 non-null object\n",
"PO_Witness_PO_Witness_Gender 3432 non-null object\n",
"PO_Witness_PO_Witness_Race 3432 non-null object\n",
"PO_Witness_PO_Witness_Star 3220 non-null object\n",
"PO_Witness_PO_Witness_Birth_Year 3415 non-null float64\n",
"PO_Witness_PO_Witness_Date_Appointed 3415 non-null datetime64[ns]\n",
"Victim_Victim_Gender 12554 non-null object\n",
"Victim_Victim_Age 10041 non-null float64\n",
"Victim_Victim_Race_Desc 12554 non-null object\n",
"Complainant_Gender 22595 non-null object\n",
"Complainant_Age 19068 non-null float64\n",
"Complainant_Race_Desc 22595 non-null object\n",
"dtypes: datetime64[ns](5), float64(7), object(28)\n",
"memory usage: 9.9+ MB\n"
]
}
],
"source": [
"merged.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Saving everything"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"report1.to_csv('./Clean/June2016/report1_all_complaints.csv')\n",
"report2.to_csv('./Clean/June2016/report2_accused.csv')\n",
"report3.to_csv('./Clean/June2016/report3_po_witness.csv')\n",
"report4.to_csv('./Clean/June2016/report4_victim.csv')\n",
"report5.to_csv('./Clean/June2016/report5_complainant.csv')\n",
"merged.to_csv('./Clean/June2016/june2016_all.csv')\n",
"\n",
"report1.to_excel('./Clean/June2016/report1_all_complaints.xlsx')\n",
"report2.to_excel('./Clean/June2016/report2_accused.xlsx')\n",
"report3.to_excel('./Clean/June2016/report3_po_witness.xlsx')\n",
"report4.to_excel('./Clean/June2016/report4_victim.xlsx')\n",
"report5.to_excel('./Clean/June2016/report5_complainant.xlsx')\n",
"merged.to_excel('./Clean/June2016/june2016_all.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment