Skip to content

Instantly share code, notes, and snippets.

@diegotf30
Created September 24, 2019 04:16
Show Gist options
  • Save diegotf30/26e1ef7d17354ca7ddad630bab502932 to your computer and use it in GitHub Desktop.
Save diegotf30/26e1ef7d17354ca7ddad630bab502932 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>EmployeeName</th>\n",
" <th>JobTitle</th>\n",
" <th>BasePay</th>\n",
" <th>OvertimePay</th>\n",
" <th>OtherPay</th>\n",
" <th>Benefits</th>\n",
" <th>TotalPay</th>\n",
" <th>TotalPayBenefits</th>\n",
" <th>Year</th>\n",
" <th>Notes</th>\n",
" <th>Agency</th>\n",
" <th>Status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>NATHANIEL FORD</td>\n",
" <td>GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY</td>\n",
" <td>167411</td>\n",
" <td>0</td>\n",
" <td>400184</td>\n",
" <td>NaN</td>\n",
" <td>567595.43</td>\n",
" <td>567595.43</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>GARY JIMENEZ</td>\n",
" <td>CAPTAIN III (POLICE DEPARTMENT)</td>\n",
" <td>155966</td>\n",
" <td>245132</td>\n",
" <td>137811</td>\n",
" <td>NaN</td>\n",
" <td>538909.28</td>\n",
" <td>538909.28</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>ALBERT PARDINI</td>\n",
" <td>CAPTAIN III (POLICE DEPARTMENT)</td>\n",
" <td>212739</td>\n",
" <td>106088</td>\n",
" <td>16452.6</td>\n",
" <td>NaN</td>\n",
" <td>335279.91</td>\n",
" <td>335279.91</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>CHRISTOPHER CHONG</td>\n",
" <td>WIRE ROPE CABLE MAINTENANCE MECHANIC</td>\n",
" <td>77916</td>\n",
" <td>56120.7</td>\n",
" <td>198307</td>\n",
" <td>NaN</td>\n",
" <td>332343.61</td>\n",
" <td>332343.61</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>PATRICK GARDNER</td>\n",
" <td>DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)</td>\n",
" <td>134402</td>\n",
" <td>9737</td>\n",
" <td>182235</td>\n",
" <td>NaN</td>\n",
" <td>326373.19</td>\n",
" <td>326373.19</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148649</td>\n",
" <td>148650</td>\n",
" <td>Roy I Tillery</td>\n",
" <td>Custodian</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>PT</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148650</td>\n",
" <td>148651</td>\n",
" <td>Not provided</td>\n",
" <td>Not provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148651</td>\n",
" <td>148652</td>\n",
" <td>Not provided</td>\n",
" <td>Not provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148652</td>\n",
" <td>148653</td>\n",
" <td>Not provided</td>\n",
" <td>Not provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>Not Provided</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148653</td>\n",
" <td>148654</td>\n",
" <td>Joe Lopez</td>\n",
" <td>Counselor, Log Cabin Ranch</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>-618.13</td>\n",
" <td>0.00</td>\n",
" <td>-618.13</td>\n",
" <td>-618.13</td>\n",
" <td>2014</td>\n",
" <td>NaN</td>\n",
" <td>San Francisco</td>\n",
" <td>PT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>148654 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" Id EmployeeName \\\n",
"0 1 NATHANIEL FORD \n",
"1 2 GARY JIMENEZ \n",
"2 3 ALBERT PARDINI \n",
"3 4 CHRISTOPHER CHONG \n",
"4 5 PATRICK GARDNER \n",
"... ... ... \n",
"148649 148650 Roy I Tillery \n",
"148650 148651 Not provided \n",
"148651 148652 Not provided \n",
"148652 148653 Not provided \n",
"148653 148654 Joe Lopez \n",
"\n",
" JobTitle BasePay \\\n",
"0 GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY 167411 \n",
"1 CAPTAIN III (POLICE DEPARTMENT) 155966 \n",
"2 CAPTAIN III (POLICE DEPARTMENT) 212739 \n",
"3 WIRE ROPE CABLE MAINTENANCE MECHANIC 77916 \n",
"4 DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) 134402 \n",
"... ... ... \n",
"148649 Custodian 0.00 \n",
"148650 Not provided Not Provided \n",
"148651 Not provided Not Provided \n",
"148652 Not provided Not Provided \n",
"148653 Counselor, Log Cabin Ranch 0.00 \n",
"\n",
" OvertimePay OtherPay Benefits TotalPay TotalPayBenefits \\\n",
"0 0 400184 NaN 567595.43 567595.43 \n",
"1 245132 137811 NaN 538909.28 538909.28 \n",
"2 106088 16452.6 NaN 335279.91 335279.91 \n",
"3 56120.7 198307 NaN 332343.61 332343.61 \n",
"4 9737 182235 NaN 326373.19 326373.19 \n",
"... ... ... ... ... ... \n",
"148649 0.00 0.00 0.00 0.00 0.00 \n",
"148650 Not Provided Not Provided Not Provided 0.00 0.00 \n",
"148651 Not Provided Not Provided Not Provided 0.00 0.00 \n",
"148652 Not Provided Not Provided Not Provided 0.00 0.00 \n",
"148653 0.00 -618.13 0.00 -618.13 -618.13 \n",
"\n",
" Year Notes Agency Status \n",
"0 2011 NaN San Francisco NaN \n",
"1 2011 NaN San Francisco NaN \n",
"2 2011 NaN San Francisco NaN \n",
"3 2011 NaN San Francisco NaN \n",
"4 2011 NaN San Francisco NaN \n",
"... ... ... ... ... \n",
"148649 2014 NaN San Francisco PT \n",
"148650 2014 NaN San Francisco NaN \n",
"148651 2014 NaN San Francisco NaN \n",
"148652 2014 NaN San Francisco NaN \n",
"148653 2014 NaN San Francisco PT \n",
"\n",
"[148654 rows x 13 columns]"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"df = pd.read_csv(\"Salaries.csv\")\n",
"names = ['last name', 'first name', 'gender', 'race']\n",
"# Males\n",
"black_male = pd.read_csv('Black-Male-Names.csv', names = names)\n",
"hispanic_male = pd.read_csv('Hispanic-Male-Names.csv', names = names)\n",
"white_male = pd.read_csv('White-Male-Names.csv', names = names)\n",
"# Females\n",
"black_female = pd.read_csv('Black-Female-Names.csv', names = names)\n",
"hispanic_female = pd.read_csv('Hispanic-Female-Names.csv', names = names)\n",
"white_female = pd.read_csv('White-Female-Names.csv', names = names)\n",
"# Need to split up name here\n",
"indian_male = pd.read_csv('Indian-Male-Names.csv')\n",
"indian_female = pd.read_csv('Indian-Female-Names.csv')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Last Name</th>\n",
" <th>First Name</th>\n",
" <th>JobTitle</th>\n",
" <th>TotalPay</th>\n",
" <th>Year</th>\n",
" <th>Agency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>ford</td>\n",
" <td>nathaniel</td>\n",
" <td>general manager-metropolitan transit authority</td>\n",
" <td>567595.43</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>jimenez</td>\n",
" <td>gary</td>\n",
" <td>captain iii (police department)</td>\n",
" <td>538909.28</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>pardini</td>\n",
" <td>albert</td>\n",
" <td>captain iii (police department)</td>\n",
" <td>335279.91</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>chong</td>\n",
" <td>christopher</td>\n",
" <td>wire rope cable maintenance mechanic</td>\n",
" <td>332343.61</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>gardner</td>\n",
" <td>patrick</td>\n",
" <td>deputy chief of department,(fire department)</td>\n",
" <td>326373.19</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148649</td>\n",
" <td>148650</td>\n",
" <td>i</td>\n",
" <td>roy</td>\n",
" <td>custodian</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148650</td>\n",
" <td>148651</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148651</td>\n",
" <td>148652</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148652</td>\n",
" <td>148653</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148653</td>\n",
" <td>148654</td>\n",
" <td>lopez</td>\n",
" <td>joe</td>\n",
" <td>counselor, log cabin ranch</td>\n",
" <td>-618.13</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>148654 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Id Last Name First Name \\\n",
"0 1 ford nathaniel \n",
"1 2 jimenez gary \n",
"2 3 pardini albert \n",
"3 4 chong christopher \n",
"4 5 gardner patrick \n",
"... ... ... ... \n",
"148649 148650 i roy \n",
"148650 148651 provided not \n",
"148651 148652 provided not \n",
"148652 148653 provided not \n",
"148653 148654 lopez joe \n",
"\n",
" JobTitle TotalPay Year \\\n",
"0 general manager-metropolitan transit authority 567595.43 2011 \n",
"1 captain iii (police department) 538909.28 2011 \n",
"2 captain iii (police department) 335279.91 2011 \n",
"3 wire rope cable maintenance mechanic 332343.61 2011 \n",
"4 deputy chief of department,(fire department) 326373.19 2011 \n",
"... ... ... ... \n",
"148649 custodian 0.00 2014 \n",
"148650 not provided 0.00 2014 \n",
"148651 not provided 0.00 2014 \n",
"148652 not provided 0.00 2014 \n",
"148653 counselor, log cabin ranch -618.13 2014 \n",
"\n",
" Agency \n",
"0 San Francisco \n",
"1 San Francisco \n",
"2 San Francisco \n",
"3 San Francisco \n",
"4 San Francisco \n",
"... ... \n",
"148649 San Francisco \n",
"148650 San Francisco \n",
"148651 San Francisco \n",
"148652 San Francisco \n",
"148653 San Francisco \n",
"\n",
"[148654 rows x 7 columns]"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['Id','Last Name', 'First Name', 'JobTitle','TotalPay', 'Year', 'Agency']\n",
"x, y = df['EmployeeName'], df['JobTitle']\n",
"\n",
"df['JobTitle'] = y.str.strip().str.lower()\n",
"df['EmployeeName'] = x.str.strip().str.lower()\n",
"df['First Name'], df['Last Name'] = x.str.split().str[0], x.str.split().str[1]\n",
"del df['EmployeeName']\n",
"df = df[columns]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"male_names = pd.concat([bm_names, hm_names, wm_names, im_names])\n",
"female_names = pd.concat([bf_names, hf_names, wf_names, if_names])\n",
"female_names.drop(female_names.index[[0]], inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"x = male_names['first name']\n",
"y = female_names['first name']\n",
"male_names['first name'], female_names['first name'] = x.str.split().str[0],y.str.split().str[0]"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" first name gender last name name race\n",
"1 aaric m aaron NaN black\n",
"2 dominique m aaron NaN black\n",
"3 fredrick m aaron NaN black\n",
"4 jarvis m aaron NaN black\n",
"5 lorenzo m aaron NaN black\n",
"(120774, 5)\n"
]
}
],
"source": [
"names_df = pd.concat([male_names, female_names])\n",
"names_df['first name'] = names_df['first name'].str.strip().str.lower()\n",
"names_df.drop([0], inplace = True)\n",
"print(names_df.head())\n",
"print(names_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"male_names = pd.concat([bm_names, hm_names, wm_names, im_names])\n",
"female_names = pd.concat([bf_names, hf_names, wf_names, if_names])\n",
"female_names.drop(female_names.index[[0]], inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>first name</th>\n",
" <th>gender</th>\n",
" <th>last name</th>\n",
" <th>name</th>\n",
" <th>race</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>aaric a</td>\n",
" <td>m</td>\n",
" <td>aaron</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>dominique j</td>\n",
" <td>m</td>\n",
" <td>aaron</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>fredrick r</td>\n",
" <td>m</td>\n",
" <td>aaron</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>jarvis</td>\n",
" <td>m</td>\n",
" <td>aaron</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>lorenzo</td>\n",
" <td>m</td>\n",
" <td>aaron</td>\n",
" <td>NaN</td>\n",
" <td>black</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15377</td>\n",
" <td>NaN</td>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" <td>saroj devi</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15378</td>\n",
" <td>NaN</td>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" <td>naina @ geeta</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15379</td>\n",
" <td>NaN</td>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" <td>manju d/0 baboo lal jatav</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15380</td>\n",
" <td>NaN</td>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" <td>shivani</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15381</td>\n",
" <td>NaN</td>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" <td>nayna</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>120774 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" first name gender last name name race\n",
"1 aaric a m aaron NaN black\n",
"2 dominique j m aaron NaN black\n",
"3 fredrick r m aaron NaN black\n",
"4 jarvis m aaron NaN black\n",
"5 lorenzo m aaron NaN black\n",
"... ... ... ... ... ...\n",
"15377 NaN f NaN saroj devi indian\n",
"15378 NaN f NaN naina @ geeta indian\n",
"15379 NaN f NaN manju d/0 baboo lal jatav indian\n",
"15380 NaN f NaN shivani indian\n",
"15381 NaN f NaN nayna indian\n",
"\n",
"[120774 rows x 5 columns]"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names_df = pd.concat([male_names, female_names])\n",
"names_df['first name'] = names_df['first name'].str.strip().str.lower()\n",
"names_df.drop([0], inplace = True)\n",
"names_df"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"names_df = names_df.reset_index().drop_duplicates(subset='first name', keep='last').set_index('first name')"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Last Name</th>\n",
" <th>First Name</th>\n",
" <th>JobTitle</th>\n",
" <th>TotalPay</th>\n",
" <th>Year</th>\n",
" <th>Agency</th>\n",
" <th>gender</th>\n",
" <th>race</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>ford</td>\n",
" <td>nathaniel</td>\n",
" <td>general manager-metropolitan transit authority</td>\n",
" <td>567595.43</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>jimenez</td>\n",
" <td>gary</td>\n",
" <td>captain iii (police department)</td>\n",
" <td>538909.28</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>pardini</td>\n",
" <td>albert</td>\n",
" <td>captain iii (police department)</td>\n",
" <td>335279.91</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>chong</td>\n",
" <td>christopher</td>\n",
" <td>wire rope cable maintenance mechanic</td>\n",
" <td>332343.61</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>gardner</td>\n",
" <td>patrick</td>\n",
" <td>deputy chief of department,(fire department)</td>\n",
" <td>326373.19</td>\n",
" <td>2011</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148649</td>\n",
" <td>148650</td>\n",
" <td>i</td>\n",
" <td>roy</td>\n",
" <td>custodian</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148650</td>\n",
" <td>148651</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148651</td>\n",
" <td>148652</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148652</td>\n",
" <td>148653</td>\n",
" <td>provided</td>\n",
" <td>not</td>\n",
" <td>not provided</td>\n",
" <td>0.00</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>indian</td>\n",
" </tr>\n",
" <tr>\n",
" <td>148653</td>\n",
" <td>148654</td>\n",
" <td>lopez</td>\n",
" <td>joe</td>\n",
" <td>counselor, log cabin ranch</td>\n",
" <td>-618.13</td>\n",
" <td>2014</td>\n",
" <td>San Francisco</td>\n",
" <td>m</td>\n",
" <td>white</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>148654 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" Id Last Name First Name \\\n",
"0 1 ford nathaniel \n",
"1 2 jimenez gary \n",
"2 3 pardini albert \n",
"3 4 chong christopher \n",
"4 5 gardner patrick \n",
"... ... ... ... \n",
"148649 148650 i roy \n",
"148650 148651 provided not \n",
"148651 148652 provided not \n",
"148652 148653 provided not \n",
"148653 148654 lopez joe \n",
"\n",
" JobTitle TotalPay Year \\\n",
"0 general manager-metropolitan transit authority 567595.43 2011 \n",
"1 captain iii (police department) 538909.28 2011 \n",
"2 captain iii (police department) 335279.91 2011 \n",
"3 wire rope cable maintenance mechanic 332343.61 2011 \n",
"4 deputy chief of department,(fire department) 326373.19 2011 \n",
"... ... ... ... \n",
"148649 custodian 0.00 2014 \n",
"148650 not provided 0.00 2014 \n",
"148651 not provided 0.00 2014 \n",
"148652 not provided 0.00 2014 \n",
"148653 counselor, log cabin ranch -618.13 2014 \n",
"\n",
" Agency gender race \n",
"0 San Francisco m white \n",
"1 San Francisco m white \n",
"2 San Francisco m white \n",
"3 San Francisco m white \n",
"4 San Francisco m white \n",
"... ... ... ... \n",
"148649 San Francisco m white \n",
"148650 San Francisco m indian \n",
"148651 San Francisco m indian \n",
"148652 San Francisco m indian \n",
"148653 San Francisco m white \n",
"\n",
"[148654 rows x 9 columns]"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['gender'], df['race'] = df['First Name'].map(names_df.gender), df['First Name'].map(names_df.race)\n",
"all_groups = df.groupby(['Year'],as_index=False).mean()\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment