Last active
September 20, 2018 10:15
-
-
Save dharmeshdev19/4411a1389b85965e19495ff9901894ef to your computer and use it in GitHub Desktop.
Pandas / 02 - Learning Pandas / 10 - Working with Categorical Data in Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Categorical Data\n", | |
"\n", | |
"Categoricals are a pandas data type, which correspond to categorical variables in statistics: a variable, which can take\n", | |
"on only a limited, and usually fixed, number of possible values. Examples are gender, social\n", | |
"class, blood types, country affiliations, observation time or ratings via Likert scales.\n", | |
"\n", | |
"In contrast to statistical categorical variables, categorical data might have an order (e.g. ‘strongly agree’ vs ‘agree’ or\n", | |
"‘first observation’ vs. ‘second observation’), but numerical operations (additions, divisions, ...) are not possible.\n", | |
"\n", | |
"All values of categorical data are either in categories or np.nan. Order is defined by the order of categories, not lexical\n", | |
"order of the values.\n", | |
"\n", | |
"documentation: http://pandas.pydata.org/pandas-docs/stable/categorical.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"file_name_string = 'D:/Project Data/EmployeesWithGrades.xlsx' # Make sure the sample file is at the target destination, else the code won't work\n", | |
"employees_df = pd.read_excel(file_name_string, 'Sheet1', index_col=None, na_values=['NA'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Department</th>\n", | |
" <th>Name</th>\n", | |
" <th>YearsOfService</th>\n", | |
" <th>Grade</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Able</td>\n", | |
" <td>4</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Baker</td>\n", | |
" <td>7</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Charlie</td>\n", | |
" <td>12</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Delta</td>\n", | |
" <td>1</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Echo</td>\n", | |
" <td>15</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Foxtrot</td>\n", | |
" <td>9</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Golf</td>\n", | |
" <td>3</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Hotel</td>\n", | |
" <td>1</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>India</td>\n", | |
" <td>2</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Juliet</td>\n", | |
" <td>5</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Kilo</td>\n", | |
" <td>7</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Lima</td>\n", | |
" <td>11</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Mike</td>\n", | |
" <td>2</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>November</td>\n", | |
" <td>3</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Oscar</td>\n", | |
" <td>4</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Papa</td>\n", | |
" <td>9</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Quebec</td>\n", | |
" <td>1</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Romeo</td>\n", | |
" <td>1</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Sierra</td>\n", | |
" <td>1</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Tango</td>\n", | |
" <td>7</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Uniform</td>\n", | |
" <td>5</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Victor</td>\n", | |
" <td>19</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Whiskey</td>\n", | |
" <td>2</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Xray</td>\n", | |
" <td>3</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Yankee</td>\n", | |
" <td>8</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Zulu</td>\n", | |
" <td>17</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Department Name YearsOfService Grade\n", | |
"0 Marketing Able 4 a\n", | |
"1 Engineering Baker 7 b\n", | |
"2 Accounting Charlie 12 c\n", | |
"3 Marketing Delta 1 d\n", | |
"4 Engineering Echo 15 f\n", | |
"5 Accounting Foxtrot 9 a\n", | |
"6 Marketing Golf 3 b\n", | |
"7 Engineering Hotel 1 c\n", | |
"8 Accounting India 2 d\n", | |
"9 Marketing Juliet 5 f\n", | |
"10 Engineering Kilo 7 a\n", | |
"11 Accounting Lima 11 b\n", | |
"12 Marketing Mike 2 c\n", | |
"13 Engineering November 3 d\n", | |
"14 Accounting Oscar 4 f\n", | |
"15 Marketing Papa 9 a\n", | |
"16 Engineering Quebec 1 b\n", | |
"17 Accounting Romeo 1 c\n", | |
"18 Marketing Sierra 1 d\n", | |
"19 Engineering Tango 7 f\n", | |
"20 Accounting Uniform 5 a\n", | |
"21 Marketing Victor 19 b\n", | |
"22 Engineering Whiskey 2 c\n", | |
"23 Accounting Xray 3 d\n", | |
"24 Marketing Yankee 8 f\n", | |
"25 Engineering Zulu 17 a" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"employees_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Change data type\n", | |
"change data type for \"Grade\" column to category\n", | |
"\n", | |
"documentation for astype(): http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.astype.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"employees_df[\"Grade\"] = employees_df[\"Grade\"].astype(\"category\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Rename the categories\n", | |
"Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"employees_df[\"Grade\"].cat.categories = [\"excellent\", \"good\", \"acceptable\", \"poor\", \"unacceptable\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Values in data frame have not changed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Department</th>\n", | |
" <th>Name</th>\n", | |
" <th>YearsOfService</th>\n", | |
" <th>Grade</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Able</td>\n", | |
" <td>4</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Baker</td>\n", | |
" <td>7</td>\n", | |
" <td>good</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Charlie</td>\n", | |
" <td>12</td>\n", | |
" <td>acceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Delta</td>\n", | |
" <td>1</td>\n", | |
" <td>poor</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Echo</td>\n", | |
" <td>15</td>\n", | |
" <td>unacceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Foxtrot</td>\n", | |
" <td>9</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Golf</td>\n", | |
" <td>3</td>\n", | |
" <td>good</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Hotel</td>\n", | |
" <td>1</td>\n", | |
" <td>acceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>India</td>\n", | |
" <td>2</td>\n", | |
" <td>poor</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Juliet</td>\n", | |
" <td>5</td>\n", | |
" <td>unacceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Kilo</td>\n", | |
" <td>7</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Lima</td>\n", | |
" <td>11</td>\n", | |
" <td>good</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Mike</td>\n", | |
" <td>2</td>\n", | |
" <td>acceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>November</td>\n", | |
" <td>3</td>\n", | |
" <td>poor</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Oscar</td>\n", | |
" <td>4</td>\n", | |
" <td>unacceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Papa</td>\n", | |
" <td>9</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Quebec</td>\n", | |
" <td>1</td>\n", | |
" <td>good</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Romeo</td>\n", | |
" <td>1</td>\n", | |
" <td>acceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Sierra</td>\n", | |
" <td>1</td>\n", | |
" <td>poor</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Tango</td>\n", | |
" <td>7</td>\n", | |
" <td>unacceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Uniform</td>\n", | |
" <td>5</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Victor</td>\n", | |
" <td>19</td>\n", | |
" <td>good</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Whiskey</td>\n", | |
" <td>2</td>\n", | |
" <td>acceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>Accounting</td>\n", | |
" <td>Xray</td>\n", | |
" <td>3</td>\n", | |
" <td>poor</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>Marketing</td>\n", | |
" <td>Yankee</td>\n", | |
" <td>8</td>\n", | |
" <td>unacceptable</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>Engineering</td>\n", | |
" <td>Zulu</td>\n", | |
" <td>17</td>\n", | |
" <td>excellent</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Department Name YearsOfService Grade\n", | |
"0 Marketing Able 4 excellent\n", | |
"1 Engineering Baker 7 good\n", | |
"2 Accounting Charlie 12 acceptable\n", | |
"3 Marketing Delta 1 poor\n", | |
"4 Engineering Echo 15 unacceptable\n", | |
"5 Accounting Foxtrot 9 excellent\n", | |
"6 Marketing Golf 3 good\n", | |
"7 Engineering Hotel 1 acceptable\n", | |
"8 Accounting India 2 poor\n", | |
"9 Marketing Juliet 5 unacceptable\n", | |
"10 Engineering Kilo 7 excellent\n", | |
"11 Accounting Lima 11 good\n", | |
"12 Marketing Mike 2 acceptable\n", | |
"13 Engineering November 3 poor\n", | |
"14 Accounting Oscar 4 unacceptable\n", | |
"15 Marketing Papa 9 excellent\n", | |
"16 Engineering Quebec 1 good\n", | |
"17 Accounting Romeo 1 acceptable\n", | |
"18 Marketing Sierra 1 poor\n", | |
"19 Engineering Tango 7 unacceptable\n", | |
"20 Accounting Uniform 5 excellent\n", | |
"21 Marketing Victor 19 good\n", | |
"22 Engineering Whiskey 2 acceptable\n", | |
"23 Accounting Xray 3 poor\n", | |
"24 Marketing Yankee 8 unacceptable\n", | |
"25 Engineering Zulu 17 excellent" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"employees_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"tabulate Department, Name, and YearsOfService, by Grade" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Department</th>\n", | |
" <th>Name</th>\n", | |
" <th>YearsOfService</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Grade</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>excellent</th>\n", | |
" <td>6</td>\n", | |
" <td>6</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>good</th>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>acceptable</th>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>poor</th>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unacceptable</th>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Department Name YearsOfService\n", | |
"Grade \n", | |
"excellent 6 6 6\n", | |
"good 5 5 5\n", | |
"acceptable 5 5 5\n", | |
"poor 5 5 5\n", | |
"unacceptable 5 5 5" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"employees_df.groupby('Grade').count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment