Last active
September 20, 2018 10:12
-
-
Save dharmeshdev19/09851f4a33a52e128845cf8d856b8bca to your computer and use it in GitHub Desktop.
Pandas / 02 - Learning Pandas / 05 - Operations on DataFrame in Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" <th>Fruits</th>\n", | |
" <th>Extra Data</th>\n", | |
" <th>G</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2016-07-01</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>apple</td>\n", | |
" <td>4</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-02</th>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>6</td>\n", | |
" <td>7</td>\n", | |
" <td>orange</td>\n", | |
" <td>7</td>\n", | |
" <td>107</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-03</th>\n", | |
" <td>8</td>\n", | |
" <td>9</td>\n", | |
" <td>10</td>\n", | |
" <td>11</td>\n", | |
" <td>banana</td>\n", | |
" <td>10</td>\n", | |
" <td>207</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-04</th>\n", | |
" <td>12</td>\n", | |
" <td>13</td>\n", | |
" <td>14</td>\n", | |
" <td>15</td>\n", | |
" <td>strawberry</td>\n", | |
" <td>13</td>\n", | |
" <td>307</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-05</th>\n", | |
" <td>16</td>\n", | |
" <td>17</td>\n", | |
" <td>18</td>\n", | |
" <td>19</td>\n", | |
" <td>blueberry</td>\n", | |
" <td>16</td>\n", | |
" <td>407</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-06</th>\n", | |
" <td>20</td>\n", | |
" <td>21</td>\n", | |
" <td>22</td>\n", | |
" <td>23</td>\n", | |
" <td>pineapple</td>\n", | |
" <td>19</td>\n", | |
" <td>507</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C D Fruits Extra Data G\n", | |
"2016-07-01 0 1 2 3 apple 4 7\n", | |
"2016-07-02 4 5 6 7 orange 7 107\n", | |
"2016-07-03 8 9 10 11 banana 10 207\n", | |
"2016-07-04 12 13 14 15 strawberry 13 307\n", | |
"2016-07-05 16 17 18 19 blueberry 16 407\n", | |
"2016-07-06 20 21 22 23 pineapple 19 507" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"starting_date = '20160701'\n", | |
"sample_numpy_data = np.array(np.arange(24)).reshape((6,4))\n", | |
"dates_index = pd.date_range(starting_date, periods=6)\n", | |
"sample_df = pd.DataFrame(sample_numpy_data, index=dates_index, columns=list('ABCD'))\n", | |
"\n", | |
"sample_df_2 = sample_df.copy()\n", | |
"sample_df_2['Fruits'] = ['apple', 'orange','banana','strawberry','blueberry','pineapple']\n", | |
"\n", | |
"sample_series = pd.Series([1,2,3,4,5,6], index=pd.date_range(starting_date, periods=6))\n", | |
"sample_df_2['Extra Data'] = sample_series *3 +1\n", | |
"\n", | |
"second_numpy_array = np.array(np.arange(len(sample_df_2))) *100 + 7\n", | |
"sample_df_2['G'] = second_numpy_array\n", | |
"\n", | |
"sample_df_2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### descriptive statistics" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" <th>Extra Data</th>\n", | |
" <th>G</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>6.00</td>\n", | |
" <td>6.00</td>\n", | |
" <td>6.00</td>\n", | |
" <td>6.00</td>\n", | |
" <td>6.00</td>\n", | |
" <td>6.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>10.00</td>\n", | |
" <td>11.00</td>\n", | |
" <td>12.00</td>\n", | |
" <td>13.00</td>\n", | |
" <td>11.50</td>\n", | |
" <td>257.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>7.48</td>\n", | |
" <td>7.48</td>\n", | |
" <td>7.48</td>\n", | |
" <td>7.48</td>\n", | |
" <td>5.61</td>\n", | |
" <td>187.08</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>0.00</td>\n", | |
" <td>1.00</td>\n", | |
" <td>2.00</td>\n", | |
" <td>3.00</td>\n", | |
" <td>4.00</td>\n", | |
" <td>7.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>5.00</td>\n", | |
" <td>6.00</td>\n", | |
" <td>7.00</td>\n", | |
" <td>8.00</td>\n", | |
" <td>7.75</td>\n", | |
" <td>132.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>10.00</td>\n", | |
" <td>11.00</td>\n", | |
" <td>12.00</td>\n", | |
" <td>13.00</td>\n", | |
" <td>11.50</td>\n", | |
" <td>257.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>15.00</td>\n", | |
" <td>16.00</td>\n", | |
" <td>17.00</td>\n", | |
" <td>18.00</td>\n", | |
" <td>15.25</td>\n", | |
" <td>382.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>20.00</td>\n", | |
" <td>21.00</td>\n", | |
" <td>22.00</td>\n", | |
" <td>23.00</td>\n", | |
" <td>19.00</td>\n", | |
" <td>507.00</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C D Extra Data G\n", | |
"count 6.00 6.00 6.00 6.00 6.00 6.00\n", | |
"mean 10.00 11.00 12.00 13.00 11.50 257.00\n", | |
"std 7.48 7.48 7.48 7.48 5.61 187.08\n", | |
"min 0.00 1.00 2.00 3.00 4.00 7.00\n", | |
"25% 5.00 6.00 7.00 8.00 7.75 132.00\n", | |
"50% 10.00 11.00 12.00 13.00 11.50 257.00\n", | |
"75% 15.00 16.00 17.00 18.00 15.25 382.00\n", | |
"max 20.00 21.00 22.00 23.00 19.00 507.00" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.set_option('display.precision', 2)\n", | |
"sample_df_2.describe()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find each column mean" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"A 10.0\n", | |
"B 11.0\n", | |
"C 12.0\n", | |
"D 13.0\n", | |
"Extra Data 11.5\n", | |
"G 257.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_df_2.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"A 10.0\n", | |
"B 11.0\n", | |
"C 12.0\n", | |
"D 13.0\n", | |
"Extra Data 11.5\n", | |
"G 257.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_df_2.mean(0) # same as above but we explicitly define the axis as 0 for column" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find each row mean\n", | |
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.mean.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2016-07-01 2.83\n", | |
"2016-07-02 22.67\n", | |
"2016-07-03 42.50\n", | |
"2016-07-04 62.33\n", | |
"2016-07-05 82.17\n", | |
"2016-07-06 102.00\n", | |
"Freq: D, dtype: float64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_df_2.mean(1) # 1 indicate that find mean on row level i.e. axis = 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### apply (a function to a data frame)\n", | |
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" <th>Fruits</th>\n", | |
" <th>Extra Data</th>\n", | |
" <th>G</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2016-07-01</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>apple</td>\n", | |
" <td>4</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-02</th>\n", | |
" <td>4</td>\n", | |
" <td>6</td>\n", | |
" <td>8</td>\n", | |
" <td>10</td>\n", | |
" <td>appleorange</td>\n", | |
" <td>11</td>\n", | |
" <td>114</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-03</th>\n", | |
" <td>12</td>\n", | |
" <td>15</td>\n", | |
" <td>18</td>\n", | |
" <td>21</td>\n", | |
" <td>appleorangebanana</td>\n", | |
" <td>21</td>\n", | |
" <td>321</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-04</th>\n", | |
" <td>24</td>\n", | |
" <td>28</td>\n", | |
" <td>32</td>\n", | |
" <td>36</td>\n", | |
" <td>appleorangebananastrawberry</td>\n", | |
" <td>34</td>\n", | |
" <td>628</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-05</th>\n", | |
" <td>40</td>\n", | |
" <td>45</td>\n", | |
" <td>50</td>\n", | |
" <td>55</td>\n", | |
" <td>appleorangebananastrawberryblueberry</td>\n", | |
" <td>50</td>\n", | |
" <td>1035</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2016-07-06</th>\n", | |
" <td>60</td>\n", | |
" <td>66</td>\n", | |
" <td>72</td>\n", | |
" <td>78</td>\n", | |
" <td>appleorangebananastrawberryblueberrypineapple</td>\n", | |
" <td>69</td>\n", | |
" <td>1542</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C D Fruits \\\n", | |
"2016-07-01 0 1 2 3 apple \n", | |
"2016-07-02 4 6 8 10 appleorange \n", | |
"2016-07-03 12 15 18 21 appleorangebanana \n", | |
"2016-07-04 24 28 32 36 appleorangebananastrawberry \n", | |
"2016-07-05 40 45 50 55 appleorangebananastrawberryblueberry \n", | |
"2016-07-06 60 66 72 78 appleorangebananastrawberryblueberrypineapple \n", | |
"\n", | |
" Extra Data G \n", | |
"2016-07-01 4 7 \n", | |
"2016-07-02 11 114 \n", | |
"2016-07-03 21 321 \n", | |
"2016-07-04 34 628 \n", | |
"2016-07-05 50 1035 \n", | |
"2016-07-06 69 1542 " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_df_2.apply(np.cumsum, axis=0) # perform cumilative sum on column basis " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### string methods\n", | |
"documentation: http://pandas.pydata.org/pandas-docs/stable/text.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 a\n", | |
"1 b\n", | |
"2 c\n", | |
"3 aaba\n", | |
"4 baca\n", | |
"5 NaN\n", | |
"6 caba\n", | |
"7 dog\n", | |
"8 cat\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])\n", | |
"s.str.lower()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 1.0\n", | |
"1 1.0\n", | |
"2 1.0\n", | |
"3 4.0\n", | |
"4 4.0\n", | |
"5 NaN\n", | |
"6 4.0\n", | |
"7 3.0\n", | |
"8 3.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"s.str.len() # gives string length of each element in the series array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment