Skip to content

Instantly share code, notes, and snippets.

@dharmeshdev19
Last active September 20, 2018 10:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dharmeshdev19/2212326a99f7b2e5114a0eaeb6a95305 to your computer and use it in GitHub Desktop.
Save dharmeshdev19/2212326a99f7b2e5114a0eaeb6a95305 to your computer and use it in GitHub Desktop.
Pandas / 02 - Learning Pandas / 06 - Merge Operations in Pandas
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge\n",
"- Concat \n",
"- Join \n",
"- Append "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>orange</td>\n",
" <td>7</td>\n",
" <td>107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>banana</td>\n",
" <td>10</td>\n",
" <td>207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-04</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>strawberry</td>\n",
" <td>13</td>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>blueberry</td>\n",
" <td>16</td>\n",
" <td>407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>pineapple</td>\n",
" <td>19</td>\n",
" <td>507</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits Extra Data G\n",
"2016-07-01 0 1 2 3 apple 4 7\n",
"2016-07-02 4 5 6 7 orange 7 107\n",
"2016-07-03 8 9 10 11 banana 10 207\n",
"2016-07-04 12 13 14 15 strawberry 13 307\n",
"2016-07-05 16 17 18 19 blueberry 16 407\n",
"2016-07-06 20 21 22 23 pineapple 19 507"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"starting_date = '20160701'\n",
"sample_numpy_data = np.array(np.arange(24)).reshape((6,4))\n",
"dates_index = pd.date_range(starting_date, periods=6)\n",
"sample_df = pd.DataFrame(sample_numpy_data, index=dates_index, columns=list('ABCD'))\n",
"\n",
"sample_df_2 = sample_df.copy()\n",
"sample_df_2['Fruits'] = ['apple', 'orange','banana','strawberry','blueberry','pineapple']\n",
"\n",
"sample_series = pd.Series([1,2,3,4,5,6], index=pd.date_range(starting_date, periods=6))\n",
"sample_df_2['Extra Data'] = sample_series *3 +1\n",
"\n",
"second_numpy_array = np.array(np.arange(len(sample_df_2))) *100 + 7\n",
"sample_df_2['G'] = second_numpy_array\n",
"\n",
"sample_df_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### concat()\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html\n",
"##### separate data frame into a list with 3 elements"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[ A B C D Fruits Extra Data G\n",
" 2016-07-01 0 1 2 3 apple 4 7\n",
" 2016-07-02 4 5 6 7 orange 7 107,\n",
" A B C D Fruits Extra Data G\n",
" 2016-07-03 8 9 10 11 banana 10 207\n",
" 2016-07-04 12 13 14 15 strawberry 13 307,\n",
" A B C D Fruits Extra Data G\n",
" 2016-07-05 16 17 18 19 blueberry 16 407\n",
" 2016-07-06 20 21 22 23 pineapple 19 507]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pieces = [sample_df_2[:2], sample_df_2[2:4], sample_df_2[4:]]\n",
"pieces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### concatenate first and last elements"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>orange</td>\n",
" <td>7</td>\n",
" <td>107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>blueberry</td>\n",
" <td>16</td>\n",
" <td>407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>pineapple</td>\n",
" <td>19</td>\n",
" <td>507</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits Extra Data G\n",
"2016-07-01 0 1 2 3 apple 4 7\n",
"2016-07-02 4 5 6 7 orange 7 107\n",
"2016-07-05 16 17 18 19 blueberry 16 407\n",
"2016-07-06 20 21 22 23 pineapple 19 507"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_list = pieces[0], pieces[2]\n",
"pd.concat(new_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### append()\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"new_last_row = sample_df_2.iloc[2]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>orange</td>\n",
" <td>7</td>\n",
" <td>107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>banana</td>\n",
" <td>10</td>\n",
" <td>207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-04</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>strawberry</td>\n",
" <td>13</td>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>blueberry</td>\n",
" <td>16</td>\n",
" <td>407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>pineapple</td>\n",
" <td>19</td>\n",
" <td>507</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>banana</td>\n",
" <td>10</td>\n",
" <td>207</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits Extra Data G\n",
"2016-07-01 0 1 2 3 apple 4 7\n",
"2016-07-02 4 5 6 7 orange 7 107\n",
"2016-07-03 8 9 10 11 banana 10 207\n",
"2016-07-04 12 13 14 15 strawberry 13 307\n",
"2016-07-05 16 17 18 19 blueberry 16 407\n",
"2016-07-06 20 21 22 23 pineapple 19 507\n",
"2016-07-03 8 9 10 11 banana 10 207"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2.append(new_last_row) # it returns a new dataframe with the appended value"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>Fruits</th>\n",
" <th>Extra Data</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-07-01</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>apple</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-02</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>orange</td>\n",
" <td>7</td>\n",
" <td>107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-03</th>\n",
" <td>8</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>banana</td>\n",
" <td>10</td>\n",
" <td>207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-04</th>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>strawberry</td>\n",
" <td>13</td>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-05</th>\n",
" <td>16</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>blueberry</td>\n",
" <td>16</td>\n",
" <td>407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-07-06</th>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>22</td>\n",
" <td>23</td>\n",
" <td>pineapple</td>\n",
" <td>19</td>\n",
" <td>507</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D Fruits Extra Data G\n",
"2016-07-01 0 1 2 3 apple 4 7\n",
"2016-07-02 4 5 6 7 orange 7 107\n",
"2016-07-03 8 9 10 11 banana 10 207\n",
"2016-07-04 12 13 14 15 strawberry 13 307\n",
"2016-07-05 16 17 18 19 blueberry 16 407\n",
"2016-07-06 20 21 22 23 pineapple 19 507"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_df_2 # original dataframe object is unchanged"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### merge()\n",
"documentation: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html\n",
" \n",
"Merge DataFrame objects by performing a database-style join operation by columns or indexes.\n",
"\n",
"If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"left = pd.DataFrame({'my_key': ['K0', 'K1', 'K2', 'K3'],\n",
" 'A': ['A0', 'A1', 'A2', 'A3'],\n",
" 'B': ['B0', 'B1', 'B2', 'B3']})\n",
"right = pd.DataFrame({'my_key': ['K0', 'K1', 'K2', 'K3'],\n",
" 'C': ['C0', 'C1', 'C2', 'C3'],\n",
" 'D': ['D0', 'D1', 'D2', 'D3']})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>my_key</th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>K0</td>\n",
" <td>A0</td>\n",
" <td>B0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>K1</td>\n",
" <td>A1</td>\n",
" <td>B1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>K2</td>\n",
" <td>A2</td>\n",
" <td>B2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>K3</td>\n",
" <td>A3</td>\n",
" <td>B3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" my_key A B\n",
"0 K0 A0 B0\n",
"1 K1 A1 B1\n",
"2 K2 A2 B2\n",
"3 K3 A3 B3"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>my_key</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>K0</td>\n",
" <td>C0</td>\n",
" <td>D0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>K1</td>\n",
" <td>C1</td>\n",
" <td>D1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>K2</td>\n",
" <td>C2</td>\n",
" <td>D2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>K3</td>\n",
" <td>C3</td>\n",
" <td>D3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" my_key C D\n",
"0 K0 C0 D0\n",
"1 K1 C1 D1\n",
"2 K2 C2 D2\n",
"3 K3 C3 D3"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>my_key</th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>K0</td>\n",
" <td>A0</td>\n",
" <td>B0</td>\n",
" <td>C0</td>\n",
" <td>D0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>K1</td>\n",
" <td>A1</td>\n",
" <td>B1</td>\n",
" <td>C1</td>\n",
" <td>D1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>K2</td>\n",
" <td>A2</td>\n",
" <td>B2</td>\n",
" <td>C2</td>\n",
" <td>D2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>K3</td>\n",
" <td>A3</td>\n",
" <td>B3</td>\n",
" <td>C3</td>\n",
" <td>D3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" my_key A B C D\n",
"0 K0 A0 B0 C0 D0\n",
"1 K1 A1 B1 C1 D1\n",
"2 K2 A2 B2 C2 D2\n",
"3 K3 A3 B3 C3 D3"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = pd.merge(left, right, on='my_key')\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment