Skip to content

Instantly share code, notes, and snippets.

@dharmeshdev19
Last active September 20, 2018 10:12
Show Gist options
  • Save dharmeshdev19/09851f4a33a52e128845cf8d856b8bca to your computer and use it in GitHub Desktop.
Save dharmeshdev19/09851f4a33a52e128845cf8d856b8bca to your computer and use it in GitHub Desktop.
Pandas / 02 - Learning Pandas / 05 - Operations on DataFrame in Pandas
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>orange</td>\n",
" <td>7</td>\n",
" <td>107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>banana</td>\n",
" <td>10</td>\n",
" <td>207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-04</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>strawberry</td>\n",
" <td>13</td>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>blueberry</td>\n",
" <td>16</td>\n",
" <td>407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>pineapple</td>\n",
" <td>19</td>\n",
" <td>507</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits Extra Data G\n",
"2016-07-01 0 1 2 3 apple 4 7\n",
"2016-07-02 4 5 6 7 orange 7 107\n",
"2016-07-03 8 9 10 11 banana 10 207\n",
"2016-07-04 12 13 14 15 strawberry 13 307\n",
"2016-07-05 16 17 18 19 blueberry 16 407\n",
"2016-07-06 20 21 22 23 pineapple 19 507"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"starting_date = '20160701'\n",
"sample_numpy_data = np.array(np.arange(24)).reshape((6,4))\n",
"dates_index = pd.date_range(starting_date, periods=6)\n",
"sample_df = pd.DataFrame(sample_numpy_data, index=dates_index, columns=list('ABCD'))\n",
"\n",
"sample_df_2 = sample_df.copy()\n",
"sample_df_2['Fruits'] = ['apple', 'orange','banana','strawberry','blueberry','pineapple']\n",
"\n",
"sample_series = pd.Series([1,2,3,4,5,6], index=pd.date_range(starting_date, periods=6))\n",
"sample_df_2['Extra Data'] = sample_series *3 +1\n",
"\n",
"second_numpy_array = np.array(np.arange(len(sample_df_2))) *100 + 7\n",
"sample_df_2['G'] = second_numpy_array\n",
"\n",
"sample_df_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### descriptive statistics"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6.00</td>\n",
" <td>6.00</td>\n",
" <td>6.00</td>\n",
" <td>6.00</td>\n",
" <td>6.00</td>\n",
" <td>6.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>10.00</td>\n",
" <td>11.00</td>\n",
" <td>12.00</td>\n",
" <td>13.00</td>\n",
" <td>11.50</td>\n",
" <td>257.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>7.48</td>\n",
" <td>7.48</td>\n",
" <td>7.48</td>\n",
" <td>7.48</td>\n",
" <td>5.61</td>\n",
" <td>187.08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>2.00</td>\n",
" <td>3.00</td>\n",
" <td>4.00</td>\n",
" <td>7.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.00</td>\n",
" <td>6.00</td>\n",
" <td>7.00</td>\n",
" <td>8.00</td>\n",
" <td>7.75</td>\n",
" <td>132.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>10.00</td>\n",
" <td>11.00</td>\n",
" <td>12.00</td>\n",
" <td>13.00</td>\n",
" <td>11.50</td>\n",
" <td>257.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>15.00</td>\n",
" <td>16.00</td>\n",
" <td>17.00</td>\n",
" <td>18.00</td>\n",
" <td>15.25</td>\n",
" <td>382.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>20.00</td>\n",
" <td>21.00</td>\n",
" <td>22.00</td>\n",
" <td>23.00</td>\n",
" <td>19.00</td>\n",
" <td>507.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Extra Data G\n",
"count 6.00 6.00 6.00 6.00 6.00 6.00\n",
"mean 10.00 11.00 12.00 13.00 11.50 257.00\n",
"std 7.48 7.48 7.48 7.48 5.61 187.08\n",
"min 0.00 1.00 2.00 3.00 4.00 7.00\n",
"25% 5.00 6.00 7.00 8.00 7.75 132.00\n",
"50% 10.00 11.00 12.00 13.00 11.50 257.00\n",
"75% 15.00 16.00 17.00 18.00 15.25 382.00\n",
"max 20.00 21.00 22.00 23.00 19.00 507.00"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.set_option('display.precision', 2)\n",
"sample_df_2.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find each column mean"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A 10.0\n",
"B 11.0\n",
"C 12.0\n",
"D 13.0\n",
"Extra Data 11.5\n",
"G 257.0\n",
"dtype: float64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2.mean()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A 10.0\n",
"B 11.0\n",
"C 12.0\n",
"D 13.0\n",
"Extra Data 11.5\n",
"G 257.0\n",
"dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2.mean(0) # same as above but we explicitly define the axis as 0 for column"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find each row mean\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.mean.html"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2016-07-01 2.83\n",
"2016-07-02 22.67\n",
"2016-07-03 42.50\n",
"2016-07-04 62.33\n",
"2016-07-05 82.17\n",
"2016-07-06 102.00\n",
"Freq: D, dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2.mean(1) # 1 indicate that find mean on row level i.e. axis = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### apply (a function to a data frame)\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>10</td>\n",
" <td>appleorange</td>\n",
" <td>11</td>\n",
" <td>114</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>12</td>\n",
" <td>15</td>\n",
" <td>18</td>\n",
" <td>21</td>\n",
" <td>appleorangebanana</td>\n",
" <td>21</td>\n",
" <td>321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-04</th>\n",
" <td>24</td>\n",
" <td>28</td>\n",
" <td>32</td>\n",
" <td>36</td>\n",
" <td>appleorangebananastrawberry</td>\n",
" <td>34</td>\n",
" <td>628</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>40</td>\n",
" <td>45</td>\n",
" <td>50</td>\n",
" <td>55</td>\n",
" <td>appleorangebananastrawberryblueberry</td>\n",
" <td>50</td>\n",
" <td>1035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>60</td>\n",
" <td>66</td>\n",
" <td>72</td>\n",
" <td>78</td>\n",
" <td>appleorangebananastrawberryblueberrypineapple</td>\n",
" <td>69</td>\n",
" <td>1542</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits \\\n",
"2016-07-01 0 1 2 3 apple \n",
"2016-07-02 4 6 8 10 appleorange \n",
"2016-07-03 12 15 18 21 appleorangebanana \n",
"2016-07-04 24 28 32 36 appleorangebananastrawberry \n",
"2016-07-05 40 45 50 55 appleorangebananastrawberryblueberry \n",
"2016-07-06 60 66 72 78 appleorangebananastrawberryblueberrypineapple \n",
"\n",
" Extra Data G \n",
"2016-07-01 4 7 \n",
"2016-07-02 11 114 \n",
"2016-07-03 21 321 \n",
"2016-07-04 34 628 \n",
"2016-07-05 50 1035 \n",
"2016-07-06 69 1542 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2.apply(np.cumsum, axis=0) # perform cumilative sum on column basis "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### string methods\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/text.html"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 a\n",
"1 b\n",
"2 c\n",
"3 aaba\n",
"4 baca\n",
"5 NaN\n",
"6 caba\n",
"7 dog\n",
"8 cat\n",
"dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])\n",
"s.str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 1.0\n",
"2 1.0\n",
"3 4.0\n",
"4 4.0\n",
"5 NaN\n",
"6 4.0\n",
"7 3.0\n",
"8 3.0\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.str.len() # gives string length of each element in the series array"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment