Skip to content

Instantly share code, notes, and snippets.

@Vikrant79
Created January 31, 2016 19:25
Show Gist options
  • Save Vikrant79/62b6a47e500f837260fa to your computer and use it in GitHub Desktop.
Save Vikrant79/62b6a47e500f837260fa to your computer and use it in GitHub Desktop.
Baby Name Challenge
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Import scikit learn libraries\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Adding', 'AK.TXT')\n",
"('Adding', 'AL.TXT')\n",
"('Adding', 'AR.TXT')\n",
"('Adding', 'AZ.TXT')\n",
"('Adding', 'CA.TXT')\n",
"('Adding', 'CO.TXT')\n",
"('Adding', 'CT.TXT')\n",
"('Adding', 'DC.TXT')\n",
"('Adding', 'DE.TXT')\n",
"('Adding', 'FL.TXT')\n",
"('Adding', 'GA.TXT')\n",
"('Adding', 'HI.TXT')\n",
"('Adding', 'IA.TXT')\n",
"('Adding', 'ID.TXT')\n",
"('Adding', 'IL.TXT')\n",
"('Adding', 'IN.TXT')\n",
"('Adding', 'KS.TXT')\n",
"('Adding', 'KY.TXT')\n",
"('Adding', 'LA.TXT')\n",
"('Adding', 'MA.TXT')\n",
"('Adding', 'MD.TXT')\n",
"('Adding', 'ME.TXT')\n",
"('Adding', 'MI.TXT')\n",
"('Adding', 'MN.TXT')\n",
"('Adding', 'MO.TXT')\n",
"('Adding', 'MS.TXT')\n",
"('Adding', 'MT.TXT')\n",
"('Adding', 'NC.TXT')\n",
"('Adding', 'ND.TXT')\n",
"('Adding', 'NE.TXT')\n",
"('Adding', 'NH.TXT')\n",
"('Adding', 'NJ.TXT')\n",
"('Adding', 'NM.TXT')\n",
"('Adding', 'NV.TXT')\n",
"('Adding', 'NY.TXT')\n",
"('Adding', 'OH.TXT')\n",
"('Adding', 'OK.TXT')\n",
"('Adding', 'OR.TXT')\n",
"('Adding', 'PA.TXT')\n",
"('Adding', 'RI.TXT')\n",
"('Adding', 'SC.TXT')\n",
"('Adding', 'SD.TXT')\n",
"('Adding', 'TN.TXT')\n",
"('Adding', 'TX.TXT')\n",
"('Adding', 'UT.TXT')\n",
"('Adding', 'VA.TXT')\n",
"('Adding', 'VT.TXT')\n",
"('Adding', 'WA.TXT')\n",
"('Adding', 'WI.TXT')\n",
"('Adding', 'WV.TXT')\n",
"('Adding', 'WY.TXT')\n"
]
}
],
"source": [
"#Combine all the 51 state files into one csv file\n",
"from glob import glob\n",
"output = open(\"USA.csv\", \"w\")\n",
"output.write(\"state,sex,year,name,count\\n\")\n",
"for fname in glob(\"*.TXT\"):\n",
" print(\"Adding\", fname)\n",
" f = open(fname)\n",
" output.write(f.read())\n",
" f.close()\n",
"output.close()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Create Pandas DataFrame form consolidated csv file\n",
"Baby_Names = pd.read_csv('USA.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>sex</th>\n",
" <th>year</th>\n",
" <th>name</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AK</td>\n",
" <td>F</td>\n",
" <td>1910</td>\n",
" <td>Mary</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AK</td>\n",
" <td>F</td>\n",
" <td>1910</td>\n",
" <td>Annie</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AK</td>\n",
" <td>F</td>\n",
" <td>1910</td>\n",
" <td>Anna</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AK</td>\n",
" <td>F</td>\n",
" <td>1910</td>\n",
" <td>Margaret</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AK</td>\n",
" <td>F</td>\n",
" <td>1910</td>\n",
" <td>Helen</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state sex year name count\n",
"0 AK F 1910 Mary 14\n",
"1 AK F 1910 Annie 12\n",
"2 AK F 1910 Anna 10\n",
"3 AK F 1910 Margaret 8\n",
"4 AK F 1910 Helen 7"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Original Data Format\n",
"Baby_Names.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Create dummy variables for cols = ['state','sex','year'] and drop the original categorical columns\n",
"Baby_Names = pd.concat([Baby_Names, pd.get_dummies(Baby_Names['sex'])], axis=1)\n",
"Baby_Names = Baby_Names.drop('sex', 1)\n",
"Baby_Names = pd.concat([Baby_Names, pd.get_dummies(Baby_Names['year'])], axis=1)\n",
"Baby_Names = Baby_Names.drop('year', 1)\n",
"Baby_Names = pd.concat([Baby_Names, pd.get_dummies(Baby_Names['state'])], axis=1)\n",
"Baby_Names = Baby_Names.drop('state', 1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Create separate columns for male count, female count and total count for each row\n",
"Baby_Names['F_count'] = Baby_Names['F']*Baby_Names['count']\n",
"Baby_Names['M_count'] = Baby_Names['M']*Baby_Names['count']\n",
"Baby_Names['total_count'] = Baby_Names['F_count']+Baby_Names['M_count']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Create separate column for total count of babies names for 1980 and 2014 year\n",
"Baby_Names['1980_count'] = Baby_Names[1980]*Baby_Names['count']\n",
"Baby_Names['2014_count'] = Baby_Names[2014]*Baby_Names['count']\n",
"\n",
"#Create separate column for total count of babies names for 1945 and 2013 year\n",
"Baby_Names['1945_count'] = Baby_Names[1945]*Baby_Names['count']\n",
"Baby_Names['2013_count'] = Baby_Names[2013]*Baby_Names['count']\n",
"\n",
"#Create separate column for total count of babies names by gender for 1945 and 2013 year\n",
"Baby_Names['1945_count_M'] = Baby_Names['1945_count']*Baby_Names['M']\n",
"Baby_Names['2013_count_M'] = Baby_Names['2013_count']*Baby_Names['M']\n",
"Baby_Names['1945_count_F'] = Baby_Names['1945_count']*Baby_Names['F']\n",
"Baby_Names['2013_count_F'] = Baby_Names['2013_count']*Baby_Names['F']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Drop the below 3 columns before we do groupby on names. These three columns are not required\n",
"Baby_Names = Baby_Names.drop('count', 1)\n",
"Baby_Names = Baby_Names.drop('F', 1)\n",
"Baby_Names = Baby_Names.drop('M', 1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Groupby whole dataFrame baed on Name. It will consolidate all the similar name and make only 1 row for each name\n",
"Baby_Names = Baby_Names.groupby(['name'], as_index=False).sum()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Create a separate column for unique name (Not shared between male and female) count in 1945 and 2013\n",
"Baby_Names['unique_name_count_1945'] = Baby_Names['1945_count_F'] - Baby_Names['1945_count_M']\n",
"Baby_Names['unique_name_count_2013'] = Baby_Names['2013_count_F'] - Baby_Names['2013_count_M']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Taking absolute value of unique name count\n",
"Baby_Names['unique_name_count_1945'] = Baby_Names['unique_name_count_1945'].abs()\n",
"Baby_Names['unique_name_count_2013'] = Baby_Names['unique_name_count_2013'].abs()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Calculating the common name percent\n",
"Baby_Names['common_name_percent_1945'] = 100 - Baby_Names['unique_name_count_1945']/Baby_Names['1945_count']*100\n",
"Baby_Names['common_name_percent_2013'] = 100 - Baby_Names['unique_name_count_2013']/Baby_Names['2013_count']*100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#if the same name is shared between male and female then common name count will be either male count or female count whichever lesser\n",
"Baby_Names['common_name_count_1945'] = Baby_Names.apply(lambda x : x['1945_count_M'] if x['1945_count_F'] >= x['1945_count_M'] else x['1945_count_F'], axis=1)\n",
"Baby_Names['common_name_count_2013'] = Baby_Names.apply(lambda x : x['2013_count_M'] if x['2013_count_F'] >= x['2013_count_M'] else x['2013_count_F'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Baby name count difference between 2014 and 1980\n",
"Baby_Names['count_diff_1980_to_2014'] = Baby_Names['2014_count'] - Baby_Names['1980_count']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Calculate percent difference in count of a name between 2014 and 1980\n",
"Baby_Names['percent_diff_1980_to_2014'] = Baby_Names['count_diff_1980_to_2014']/Baby_Names['1980_count']*100"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Some of the values will be coming as infinite in 'percent_diff_1980_to_2014' if name count in 1980 as 0. Replacing \n",
"#all infinite values with 0 to ignore all such cases\n",
"Baby_Names = Baby_Names.replace([np.inf, -np.inf], np.nan)\n",
"Baby_Names = Baby_Names.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Saving the file to feed into tableau to generate the graphs of the given baby name data\n",
"Baby_Names.to_csv('Baby_Names_tableau.csv', index = False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>total_count</th>\n",
" <th>F_count</th>\n",
" <th>M_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12442</th>\n",
" <td>James</td>\n",
" <td>4957166</td>\n",
" <td>18201</td>\n",
" <td>4938965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13761</th>\n",
" <td>John</td>\n",
" <td>4845414</td>\n",
" <td>15681</td>\n",
" <td>4829733</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23582</th>\n",
" <td>Robert</td>\n",
" <td>4725713</td>\n",
" <td>15113</td>\n",
" <td>4710600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19955</th>\n",
" <td>Michael</td>\n",
" <td>4312975</td>\n",
" <td>17196</td>\n",
" <td>4295779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29049</th>\n",
" <td>William</td>\n",
" <td>3839236</td>\n",
" <td>10210</td>\n",
" <td>3829026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19273</th>\n",
" <td>Mary</td>\n",
" <td>3740495</td>\n",
" <td>3730856</td>\n",
" <td>9639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6800</th>\n",
" <td>David</td>\n",
" <td>3562278</td>\n",
" <td>8176</td>\n",
" <td>3554102</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name total_count F_count M_count\n",
"12442 James 4957166 18201 4938965\n",
"13761 John 4845414 15681 4829733\n",
"23582 Robert 4725713 15113 4710600\n",
"19955 Michael 4312975 17196 4295779\n",
"29049 William 3839236 10210 3829026\n",
"19273 Mary 3740495 3730856 9639\n",
"6800 David 3562278 8176 3554102"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- What is the most popular name of all time? (Of either gender.)\n",
"Baby_Names_sort = Baby_Names.sort(columns='total_count', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','total_count','F_count','M_count']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>common_name_count_1945</th>\n",
" <th>1945_count_F</th>\n",
" <th>1945_count_M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>29053</th>\n",
" <td>Willie</td>\n",
" <td>1745</td>\n",
" <td>1745</td>\n",
" <td>7008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17369</th>\n",
" <td>Leslie</td>\n",
" <td>1678</td>\n",
" <td>1678</td>\n",
" <td>1976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11996</th>\n",
" <td>Jackie</td>\n",
" <td>1245</td>\n",
" <td>1245</td>\n",
" <td>1472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18175</th>\n",
" <td>Lynn</td>\n",
" <td>1194</td>\n",
" <td>2713</td>\n",
" <td>1194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27348</th>\n",
" <td>Terry</td>\n",
" <td>927</td>\n",
" <td>927</td>\n",
" <td>6825</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13798</th>\n",
" <td>Johnnie</td>\n",
" <td>915</td>\n",
" <td>915</td>\n",
" <td>1570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13485</th>\n",
" <td>Jessie</td>\n",
" <td>910</td>\n",
" <td>1072</td>\n",
" <td>910</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name common_name_count_1945 1945_count_F 1945_count_M\n",
"29053 Willie 1745 1745 7008\n",
"17369 Leslie 1678 1678 1976\n",
"11996 Jackie 1245 1245 1472\n",
"18175 Lynn 1194 2713 1194\n",
"27348 Terry 927 927 6825\n",
"13798 Johnnie 915 915 1570\n",
"13485 Jessie 910 1072 910"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- What is the most gender ambiguous name in 1945? (By count)\n",
"Baby_Names_sort = Baby_Names.sort(columns='common_name_count_1945', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','common_name_count_1945','1945_count_F','1945_count_M']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>common_name_percent_1945</th>\n",
" <th>1945_count_F</th>\n",
" <th>1945_count_M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>19488</th>\n",
" <td>Maxie</td>\n",
" <td>100.000000</td>\n",
" <td>19</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2594</th>\n",
" <td>Artie</td>\n",
" <td>97.777778</td>\n",
" <td>23</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17034</th>\n",
" <td>Lavern</td>\n",
" <td>97.222222</td>\n",
" <td>70</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9771</th>\n",
" <td>Frankie</td>\n",
" <td>93.501455</td>\n",
" <td>482</td>\n",
" <td>549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3309</th>\n",
" <td>Barrie</td>\n",
" <td>93.333333</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11226</th>\n",
" <td>Hillary</td>\n",
" <td>92.307692</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4574</th>\n",
" <td>Cameron</td>\n",
" <td>92.307692</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name common_name_percent_1945 1945_count_F 1945_count_M\n",
"19488 Maxie 100.000000 19 19\n",
"2594 Artie 97.777778 23 22\n",
"17034 Lavern 97.222222 70 74\n",
"9771 Frankie 93.501455 482 549\n",
"3309 Barrie 93.333333 8 7\n",
"11226 Hillary 92.307692 7 6\n",
"4574 Cameron 92.307692 6 7"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- What is the most gender ambiguous name in 1945? (By Percentage %) \n",
"Baby_Names_sort = Baby_Names.sort(columns='common_name_percent_1945', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','common_name_percent_1945','1945_count_F','1945_count_M']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>common_name_count_2013</th>\n",
" <th>2013_count_F</th>\n",
" <th>2013_count_M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>23504</th>\n",
" <td>Riley</td>\n",
" <td>2544</td>\n",
" <td>4921</td>\n",
" <td>2544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3000</th>\n",
" <td>Avery</td>\n",
" <td>2036</td>\n",
" <td>9162</td>\n",
" <td>2036</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22370</th>\n",
" <td>Peyton</td>\n",
" <td>1826</td>\n",
" <td>4553</td>\n",
" <td>1826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10988</th>\n",
" <td>Hayden</td>\n",
" <td>1668</td>\n",
" <td>1668</td>\n",
" <td>2951</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5182</th>\n",
" <td>Charlie</td>\n",
" <td>1303</td>\n",
" <td>1303</td>\n",
" <td>1536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13909</th>\n",
" <td>Jordan</td>\n",
" <td>1206</td>\n",
" <td>1206</td>\n",
" <td>7167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22156</th>\n",
" <td>Parker</td>\n",
" <td>1180</td>\n",
" <td>1180</td>\n",
" <td>5651</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name common_name_count_2013 2013_count_F 2013_count_M\n",
"23504 Riley 2544 4921 2544\n",
"3000 Avery 2036 9162 2036\n",
"22370 Peyton 1826 4553 1826\n",
"10988 Hayden 1668 1668 2951\n",
"5182 Charlie 1303 1303 1536\n",
"13909 Jordan 1206 1206 7167\n",
"22156 Parker 1180 1180 5651"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- What is the most gender ambiguous name in 2013? (By count)\n",
"Baby_Names_sort = Baby_Names.sort(columns='common_name_count_2013', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','common_name_count_2013','2013_count_F','2013_count_M']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>common_name_percent_2013</th>\n",
" <th>2013_count_F</th>\n",
" <th>2013_count_M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>21355</th>\n",
" <td>Nikita</td>\n",
" <td>100.000000</td>\n",
" <td>47</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26108</th>\n",
" <td>Sonam</td>\n",
" <td>100.000000</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2468</th>\n",
" <td>Arlin</td>\n",
" <td>100.000000</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6094</th>\n",
" <td>Cree</td>\n",
" <td>100.000000</td>\n",
" <td>11</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7484</th>\n",
" <td>Devine</td>\n",
" <td>100.000000</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12103</th>\n",
" <td>Jael</td>\n",
" <td>97.637795</td>\n",
" <td>124</td>\n",
" <td>130</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20097</th>\n",
" <td>Milan</td>\n",
" <td>97.149373</td>\n",
" <td>426</td>\n",
" <td>451</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name common_name_percent_2013 2013_count_F 2013_count_M\n",
"21355 Nikita 100.000000 47 47\n",
"26108 Sonam 100.000000 5 5\n",
"2468 Arlin 100.000000 5 5\n",
"6094 Cree 100.000000 11 11\n",
"7484 Devine 100.000000 10 10\n",
"12103 Jael 97.637795 124 130\n",
"20097 Milan 97.149373 426 451"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- What is the most gender ambiguous name in 2013? (By Percentage %) \n",
"Baby_Names_sort = Baby_Names.sort(columns='common_name_percent_2013', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','common_name_percent_2013','2013_count_F','2013_count_M']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>percent_diff_1980_to_2014</th>\n",
" <th>1980_count</th>\n",
" <th>2014_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5872</th>\n",
" <td>Colton</td>\n",
" <td>126600.000000</td>\n",
" <td>5</td>\n",
" <td>6335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2343</th>\n",
" <td>Aria</td>\n",
" <td>117920.000000</td>\n",
" <td>5</td>\n",
" <td>5901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26014</th>\n",
" <td>Skylar</td>\n",
" <td>101680.000000</td>\n",
" <td>5</td>\n",
" <td>5089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20089</th>\n",
" <td>Mila</td>\n",
" <td>80480.000000</td>\n",
" <td>5</td>\n",
" <td>4029</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19387</th>\n",
" <td>Mateo</td>\n",
" <td>73740.000000</td>\n",
" <td>5</td>\n",
" <td>3692</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11710</th>\n",
" <td>Isabella</td>\n",
" <td>73595.652174</td>\n",
" <td>23</td>\n",
" <td>16950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5936</th>\n",
" <td>Cooper</td>\n",
" <td>69514.285714</td>\n",
" <td>7</td>\n",
" <td>4873</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name percent_diff_1980_to_2014 1980_count 2014_count\n",
"5872 Colton 126600.000000 5 6335\n",
"2343 Aria 117920.000000 5 5901\n",
"26014 Skylar 101680.000000 5 5089\n",
"20089 Mila 80480.000000 5 4029\n",
"19387 Mateo 73740.000000 5 3692\n",
"11710 Isabella 73595.652174 23 16950\n",
"5936 Cooper 69514.285714 7 4873"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- Find the name that has had the largest percentage increase in popularity since 1980. \n",
"Baby_Names_sort = Baby_Names.sort(columns='percent_diff_1980_to_2014', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','percent_diff_1980_to_2014','1980_count','2014_count']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>percent_diff_1980_to_2014</th>\n",
" <th>1980_count</th>\n",
" <th>2014_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>27741</th>\n",
" <td>Tonya</td>\n",
" <td>-100</td>\n",
" <td>3073</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16229</th>\n",
" <td>Kristi</td>\n",
" <td>-100</td>\n",
" <td>2521</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26991</th>\n",
" <td>Tasha</td>\n",
" <td>-100</td>\n",
" <td>2256</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16887</th>\n",
" <td>Latasha</td>\n",
" <td>-100</td>\n",
" <td>2041</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25135</th>\n",
" <td>Shanna</td>\n",
" <td>-100</td>\n",
" <td>1508</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25389</th>\n",
" <td>Shawna</td>\n",
" <td>-100</td>\n",
" <td>1420</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26822</th>\n",
" <td>Tamika</td>\n",
" <td>-100</td>\n",
" <td>1251</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name percent_diff_1980_to_2014 1980_count 2014_count\n",
"27741 Tonya -100 3073 0\n",
"16229 Kristi -100 2521 0\n",
"26991 Tasha -100 2256 0\n",
"16887 Latasha -100 2041 0\n",
"25135 Shanna -100 1508 0\n",
"25389 Shawna -100 1420 0\n",
"26822 Tamika -100 1251 0"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- Find the name that has had the largest percentage decrease in popularity since 1980.\n",
"Baby_Names_sort = Baby_Names.sort(['percent_diff_1980_to_2014','1980_count'], ascending=[True, False], inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','percent_diff_1980_to_2014','1980_count','2014_count']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>count_diff_1980_to_2014</th>\n",
" <th>1980_count</th>\n",
" <th>2014_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8796</th>\n",
" <td>Emma</td>\n",
" <td>20317</td>\n",
" <td>482</td>\n",
" <td>20799</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21796</th>\n",
" <td>Olivia</td>\n",
" <td>18584</td>\n",
" <td>1090</td>\n",
" <td>19674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21470</th>\n",
" <td>Noah</td>\n",
" <td>18299</td>\n",
" <td>899</td>\n",
" <td>19198</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17470</th>\n",
" <td>Liam</td>\n",
" <td>18276</td>\n",
" <td>71</td>\n",
" <td>18347</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26138</th>\n",
" <td>Sophia</td>\n",
" <td>17911</td>\n",
" <td>584</td>\n",
" <td>18495</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11710</th>\n",
" <td>Isabella</td>\n",
" <td>16927</td>\n",
" <td>23</td>\n",
" <td>16950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19365</th>\n",
" <td>Mason</td>\n",
" <td>16820</td>\n",
" <td>297</td>\n",
" <td>17117</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name count_diff_1980_to_2014 1980_count 2014_count\n",
"8796 Emma 20317 482 20799\n",
"21796 Olivia 18584 1090 19674\n",
"21470 Noah 18299 899 19198\n",
"17470 Liam 18276 71 18347\n",
"26138 Sophia 17911 584 18495\n",
"11710 Isabella 16927 23 16950\n",
"19365 Mason 16820 297 17117"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- Find the name that has had the largest increase by numbers in popularity since 1980. \n",
"Baby_Names_sort = Baby_Names.sort(columns='count_diff_1980_to_2014', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','count_diff_1980_to_2014','1980_count','2014_count']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>count_diff_1980_to_2014</th>\n",
" <th>1980_count</th>\n",
" <th>2014_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>13288</th>\n",
" <td>Jennifer</td>\n",
" <td>-57034</td>\n",
" <td>58533</td>\n",
" <td>1499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19955</th>\n",
" <td>Michael</td>\n",
" <td>-53838</td>\n",
" <td>69166</td>\n",
" <td>15328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12830</th>\n",
" <td>Jason</td>\n",
" <td>-42926</td>\n",
" <td>48436</td>\n",
" <td>5510</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5523</th>\n",
" <td>Christopher</td>\n",
" <td>-39065</td>\n",
" <td>49343</td>\n",
" <td>10278</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1384</th>\n",
" <td>Amanda</td>\n",
" <td>-34833</td>\n",
" <td>35845</td>\n",
" <td>1012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13479</th>\n",
" <td>Jessica</td>\n",
" <td>-32234</td>\n",
" <td>33988</td>\n",
" <td>1754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19774</th>\n",
" <td>Melissa</td>\n",
" <td>-30261</td>\n",
" <td>31727</td>\n",
" <td>1466</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name count_diff_1980_to_2014 1980_count 2014_count\n",
"13288 Jennifer -57034 58533 1499\n",
"19955 Michael -53838 69166 15328\n",
"12830 Jason -42926 48436 5510\n",
"5523 Christopher -39065 49343 10278\n",
"1384 Amanda -34833 35845 1012\n",
"13479 Jessica -32234 33988 1754\n",
"19774 Melissa -30261 31727 1466"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- Find the name that has had the largest decrease by numbers in popularity since 1980. \n",
"Baby_Names_sort = Baby_Names.sort(columns='count_diff_1980_to_2014', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','count_diff_1980_to_2014','1980_count','2014_count']].head(7)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>2014_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8796</th>\n",
" <td>Emma</td>\n",
" <td>20799</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21796</th>\n",
" <td>Olivia</td>\n",
" <td>19674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21470</th>\n",
" <td>Noah</td>\n",
" <td>19198</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26138</th>\n",
" <td>Sophia</td>\n",
" <td>18495</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17470</th>\n",
" <td>Liam</td>\n",
" <td>18347</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19365</th>\n",
" <td>Mason</td>\n",
" <td>17117</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11710</th>\n",
" <td>Isabella</td>\n",
" <td>16950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12008</th>\n",
" <td>Jacob</td>\n",
" <td>16712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29049</th>\n",
" <td>William</td>\n",
" <td>16687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9136</th>\n",
" <td>Ethan</td>\n",
" <td>15619</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name 2014_count\n",
"8796 Emma 20799\n",
"21796 Olivia 19674\n",
"21470 Noah 19198\n",
"26138 Sophia 18495\n",
"17470 Liam 18347\n",
"19365 Mason 17117\n",
"11710 Isabella 16950\n",
"12008 Jacob 16712\n",
"29049 William 16687\n",
"9136 Ethan 15619"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Question :- Most Popular Names in 2014 \n",
"Baby_Names_sort = Baby_Names.sort(columns='2014_count', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')\n",
"Baby_Names_sort[['name','2014_count']].head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment