Last active
July 10, 2019 18:30
-
-
Save pybokeh/738827fd0da1df378b9ed3c3d8072b10 to your computer and use it in GitHub Desktop.
pandas/TipsNTricks.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Creating date column based on other columns" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** You have dates broken up in 3 columns and need to create an actual column with dates." | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 3, | |
"data": { | |
"text/plain": " MONTH DAY YEAR\n0 7 1 2019\n1 7 2 2019\n2 7 3 2019\n3 7 4 2019\n4 7 5 2019\n5 7 6 2019\n6 7 7 2019\n7 7 8 2019\n8 7 9 2019\n9 7 10 2019", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>MONTH</th>\n <th>DAY</th>\n <th>YEAR</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>7</td>\n <td>1</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>1</th>\n <td>7</td>\n <td>2</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>2</th>\n <td>7</td>\n <td>3</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>3</th>\n <td>7</td>\n <td>4</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>4</th>\n <td>7</td>\n <td>5</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>5</th>\n <td>7</td>\n <td>6</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>7</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>7</th>\n <td>7</td>\n <td>8</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>8</th>\n <td>7</td>\n <td>9</td>\n <td>2019</td>\n </tr>\n <tr>\n <th>9</th>\n <td>7</td>\n <td>10</td>\n <td>2019</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df['DATE'] = pd.to_datetime(df[['MONTH', 'DAY', 'YEAR']])", | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 6, | |
"data": { | |
"text/plain": " MONTH DAY YEAR DATE\n0 7 1 2019 2019-07-01\n1 7 2 2019 2019-07-02\n2 7 3 2019 2019-07-03\n3 7 4 2019 2019-07-04\n4 7 5 2019 2019-07-05\n5 7 6 2019 2019-07-06\n6 7 7 2019 2019-07-07\n7 7 8 2019 2019-07-08\n8 7 9 2019 2019-07-09\n9 7 10 2019 2019-07-10", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>MONTH</th>\n <th>DAY</th>\n <th>YEAR</th>\n <th>DATE</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>7</td>\n <td>1</td>\n <td>2019</td>\n <td>2019-07-01</td>\n </tr>\n <tr>\n <th>1</th>\n <td>7</td>\n <td>2</td>\n <td>2019</td>\n <td>2019-07-02</td>\n </tr>\n <tr>\n <th>2</th>\n <td>7</td>\n <td>3</td>\n <td>2019</td>\n <td>2019-07-03</td>\n </tr>\n <tr>\n <th>3</th>\n <td>7</td>\n <td>4</td>\n <td>2019</td>\n <td>2019-07-04</td>\n </tr>\n <tr>\n <th>4</th>\n <td>7</td>\n <td>5</td>\n <td>2019</td>\n <td>2019-07-05</td>\n </tr>\n <tr>\n <th>5</th>\n <td>7</td>\n <td>6</td>\n <td>2019</td>\n <td>2019-07-06</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>7</td>\n <td>2019</td>\n <td>2019-07-07</td>\n </tr>\n <tr>\n <th>7</th>\n <td>7</td>\n <td>8</td>\n <td>2019</td>\n <td>2019-07-08</td>\n </tr>\n <tr>\n <th>8</th>\n <td>7</td>\n <td>9</td>\n <td>2019</td>\n <td>2019-07-09</td>\n </tr>\n <tr>\n <th>9</th>\n <td>7</td>\n <td>10</td>\n <td>2019</td>\n <td>2019-07-10</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.dtypes", | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 7, | |
"data": { | |
"text/plain": "MONTH int64\nDAY int64\nYEAR int64\nDATE datetime64[ns]\ndtype: object" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Combining string values from multiple columns into a single column" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** You have a first name column and a last name column. But you need a column that is of the format: last name, first name" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 69, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 69, | |
"data": { | |
"text/plain": " ID FirstName LastName\n0 1 John Smith\n1 2 Jane Doe\n2 3 Someone Else", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>FirstName</th>\n <th>LastName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>John</td>\n <td>Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Jane</td>\n <td>Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Someone</td>\n <td>Else</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df['LastFirstName'] = df['LastName'] + ', ' + df['FirstName']\ndf", | |
"execution_count": 70, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 70, | |
"data": { | |
"text/plain": " ID FirstName LastName LastFirstName\n0 1 John Smith Smith, John\n1 2 Jane Doe Doe, Jane\n2 3 Someone Else Else, Someone", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>FirstName</th>\n <th>LastName</th>\n <th>LastFirstName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>John</td>\n <td>Smith</td>\n <td>Smith, John</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Jane</td>\n <td>Doe</td>\n <td>Doe, Jane</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Someone</td>\n <td>Else</td>\n <td>Else, Someone</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Split up string into multiple columns" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** You have a column with string values, but need the string value broken up into multiple columns." | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 10, | |
"data": { | |
"text/plain": " ID LastFirstName\n0 1 John Smith\n1 2 Jane Doe\n2 3 Someone Else", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LastFirstName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>John Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Jane Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Someone Else</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df[['FirstName', 'LastName']] = df['LastFirstName'].str.split(' ', expand=True)", | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 12, | |
"data": { | |
"text/plain": " ID LastFirstName FirstName LastName\n0 1 John Smith John Smith\n1 2 Jane Doe Jane Doe\n2 3 Someone Else Someone Else", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LastFirstName</th>\n <th>FirstName</th>\n <th>LastName</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>John Smith</td>\n <td>John</td>\n <td>Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Jane Doe</td>\n <td>Jane</td>\n <td>Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Someone Else</td>\n <td>Someone</td>\n <td>Else</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "#### You don't have to consume all the split values into multiple columns, you can choose to keep the first split value or second value using indexing ( [0] or [1] ):" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 33, | |
"data": { | |
"text/plain": " ID LOCATION\n0 1 Los Angeles, CA\n1 2 Washington, D.C.\n2 3 Columbus, OH", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LOCATION</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Los Angeles, CA</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Washington, D.C.</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Columbus, OH</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df['CITY'] = df['LOCATION'].str.split(', ', expand=True)[0]", | |
"execution_count": 35, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 36, | |
"data": { | |
"text/plain": " ID LOCATION CITY\n0 1 Los Angeles, CA Los Angeles\n1 2 Washington, D.C. Washington\n2 3 Columbus, OH Columbus", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LOCATION</th>\n <th>CITY</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Los Angeles, CA</td>\n <td>Los Angeles</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Washington, D.C.</td>\n <td>Washington</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Columbus, OH</td>\n <td>Columbus</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df['STATE'] = df['LOCATION'].str.split(', ', expand=True)[1]", | |
"execution_count": 38, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 39, | |
"data": { | |
"text/plain": " ID LOCATION CITY STATE\n0 1 Los Angeles, CA Los Angeles CA\n1 2 Washington, D.C. Washington D.C.\n2 3 Columbus, OH Columbus OH", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LOCATION</th>\n <th>CITY</th>\n <th>STATE</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>Los Angeles, CA</td>\n <td>Los Angeles</td>\n <td>CA</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>Washington, D.C.</td>\n <td>Washington</td>\n <td>D.C.</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>Columbus, OH</td>\n <td>Columbus</td>\n <td>OH</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Clean columns of a specific data type" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** There are hashtag characters in your string columns. You could have several string columns. We need to remove them. There are 3 different ways depending on level of control." | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()", | |
"execution_count": 15, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 16, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 #Hello# 1 4 #Steve# #Smith#\n1 #GoodBye# 2 5 #Jane# #Doe#\n2 #Welcome# 3 6 #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Method 1: Using ```isinstance()``` to check for data type:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def remove_hashtag1(df):\n for column in df.columns:\n # if first value of column is of type str, then remove the hashtags\n if isinstance(df[column][0], str):\n df[column] = df[column].str.replace('#', '')", | |
"execution_count": 17, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "remove_hashtag1(df)", | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 19, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 Hello 1 4 Steve Smith\n1 GoodBye 2 5 Jane Doe\n2 Welcome 3 6 John Hicks", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Hello</td>\n <td>1</td>\n <td>4</td>\n <td>Steve</td>\n <td>Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>GoodBye</td>\n <td>2</td>\n <td>5</td>\n <td>Jane</td>\n <td>Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Welcome</td>\n <td>3</td>\n <td>6</td>\n <td>John</td>\n <td>Hicks</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Method 2: Using ```dtype()``` to check for data type:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def remove_hashtag2(df):\n for column in df.columns:\n # if column is of type 'object', then remove the hashtags\n if df[column].dtype == 'object':\n df[column] = df[column].str.replace('#', '')", | |
"execution_count": 22, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 20, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 #Hello# 1 4 #Steve# #Smith#\n1 #GoodBye# 2 5 #Jane# #Doe#\n2 #Welcome# 3 6 #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "remove_hashtag2(df)", | |
"execution_count": 23, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 24, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 Hello 1 4 Steve Smith\n1 GoodBye 2 5 Jane Doe\n2 Welcome 3 6 John Hicks", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Hello</td>\n <td>1</td>\n <td>4</td>\n <td>Steve</td>\n <td>Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>GoodBye</td>\n <td>2</td>\n <td>5</td>\n <td>Jane</td>\n <td>Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Welcome</td>\n <td>3</td>\n <td>6</td>\n <td>John</td>\n <td>Hicks</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Method 3: Using ```is_string_dtype``` to detect column's data type:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from pandas.api.types import is_string_dtype", | |
"execution_count": 25, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 26, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 #Hello# 1 4 #Steve# #Smith#\n1 #GoodBye# 2 5 #Jane# #Doe#\n2 #Welcome# 3 6 #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def remove_hashtag3(df):\n for column in df.columns:\n # if column is of a string data type, then strip the white space\n if is_string_dtype(df[column]):\n df[column] = df[column].str.replace('#', '')", | |
"execution_count": 29, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "remove_hashtag3(df)", | |
"execution_count": 30, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df", | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 31, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 Hello 1 4 Steve Smith\n1 GoodBye 2 5 Jane Doe\n2 Welcome 3 6 John Hicks", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Hello</td>\n <td>1</td>\n <td>4</td>\n <td>Steve</td>\n <td>Smith</td>\n </tr>\n <tr>\n <th>1</th>\n <td>GoodBye</td>\n <td>2</td>\n <td>5</td>\n <td>Jane</td>\n <td>Doe</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Welcome</td>\n <td>3</td>\n <td>6</td>\n <td>John</td>\n <td>Hicks</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Select Columns by Data Type" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** You need to isolate numeric columns from string columns or vice versa" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 52, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 52, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 #Hello# 1 4 #Steve# #Smith#\n1 #GoodBye# 2 5 #Jane# #Doe#\n2 #Welcome# 3 6 #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Get count of columns by data type:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.dtypes.value_counts()", | |
"execution_count": 55, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 55, | |
"data": { | |
"text/plain": "object 3\nint64 2\ndtype: int64" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df_num = df.select_dtypes(include='number')\ndf_num", | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 41, | |
"data": { | |
"text/plain": " Numeric1 Numeric2\n0 1 4\n1 2 5\n2 3 6", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>5</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>6</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df_str = df.select_dtypes(include='object')\ndf_str", | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 42, | |
"data": { | |
"text/plain": " str_BadData1 str_BadData2 str_BadData3\n0 #Hello# #Steve# #Smith#\n1 #GoodBye# #Jane# #Doe#\n2 #Welcome# #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Adding prefix or suffix to your column names" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 50, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 50, | |
"data": { | |
"text/plain": " str_BadData1 Numeric1 Numeric2 str_BadData2 str_BadData3\n0 #Hello# 1 4 #Steve# #Smith#\n1 #GoodBye# 2 5 #Jane# #Doe#\n2 #Welcome# 3 6 #John# #Hicks#", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1</th>\n <th>Numeric1</th>\n <th>Numeric2</th>\n <th>str_BadData2</th>\n <th>str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = df.add_prefix('BEGIN_')\ndf", | |
"execution_count": 51, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 51, | |
"data": { | |
"text/plain": " BEGIN_str_BadData1 BEGIN_Numeric1 BEGIN_Numeric2 BEGIN_str_BadData2 \\\n0 #Hello# 1 4 #Steve# \n1 #GoodBye# 2 5 #Jane# \n2 #Welcome# 3 6 #John# \n\n BEGIN_str_BadData3 \n0 #Smith# \n1 #Doe# \n2 #Hicks# ", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>BEGIN_str_BadData1</th>\n <th>BEGIN_Numeric1</th>\n <th>BEGIN_Numeric2</th>\n <th>BEGIN_str_BadData2</th>\n <th>BEGIN_str_BadData3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = df.add_suffix('_END')\ndf", | |
"execution_count": 49, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 49, | |
"data": { | |
"text/plain": " str_BadData1_END Numeric1_END Numeric2_END str_BadData2_END \\\n0 #Hello# 1 4 #Steve# \n1 #GoodBye# 2 5 #Jane# \n2 #Welcome# 3 6 #John# \n\n str_BadData3_END \n0 #Smith# \n1 #Doe# \n2 #Hicks# ", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>str_BadData1_END</th>\n <th>Numeric1_END</th>\n <th>Numeric2_END</th>\n <th>str_BadData2_END</th>\n <th>str_BadData3_END</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#Hello#</td>\n <td>1</td>\n <td>4</td>\n <td>#Steve#</td>\n <td>#Smith#</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#GoodBye#</td>\n <td>2</td>\n <td>5</td>\n <td>#Jane#</td>\n <td>#Doe#</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#Welcome#</td>\n <td>3</td>\n <td>6</td>\n <td>#John#</td>\n <td>#Hicks#</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Finding duplicate records" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 56, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 56, | |
"data": { | |
"text/plain": " K1 K2\n0 one 1\n1 one 2\n2 one 3\n3 two 3\n4 two 3\n5 two 4\n6 two 4", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>one</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>one</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>one</td>\n <td>3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>5</th>\n <td>two</td>\n <td>4</td>\n </tr>\n <tr>\n <th>6</th>\n <td>two</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "duplicate_records = df[df.duplicated()]\nduplicate_records", | |
"execution_count": 57, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 57, | |
"data": { | |
"text/plain": " K1 K2\n4 two 3\n6 two 4", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>4</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>6</th>\n <td>two</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Removing duplicate records" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 58, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 58, | |
"data": { | |
"text/plain": " K1 K2\n0 one 1\n1 one 2\n2 one 3\n3 two 3\n4 two 3\n5 two 4\n6 two 4", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>one</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>one</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>one</td>\n <td>3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>5</th>\n <td>two</td>\n <td>4</td>\n </tr>\n <tr>\n <th>6</th>\n <td>two</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.drop_duplicates() # by default, duplicate is defined by all columns", | |
"execution_count": 59, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 59, | |
"data": { | |
"text/plain": " K1 K2\n0 one 1\n1 one 2\n2 one 3\n3 two 3\n5 two 4", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>one</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>one</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>one</td>\n <td>3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>5</th>\n <td>two</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "or define what is \"duplicate\" by column name(s):" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_clipboard()\ndf", | |
"execution_count": 60, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 60, | |
"data": { | |
"text/plain": " K1 K2\n0 one 1\n1 one 2\n2 one 3\n3 two 3\n4 two 3\n5 two 4\n6 two 4", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>one</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>one</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>one</td>\n <td>3</td>\n </tr>\n <tr>\n <th>3</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>4</th>\n <td>two</td>\n <td>3</td>\n </tr>\n <tr>\n <th>5</th>\n <td>two</td>\n <td>4</td>\n </tr>\n <tr>\n <th>6</th>\n <td>two</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.drop_duplicates(subset='K1') # Record is considered \"duplicated\" based on column K1", | |
"execution_count": 61, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 61, | |
"data": { | |
"text/plain": " K1 K2\n0 one 1\n3 two 3", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>K1</th>\n <th>K2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>one</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>two</td>\n <td>3</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Create a categorical column based on numeric values or a.k.a. assigning numbers to categories based on pre-defined ranges" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "**SITUATION:** You have people's ages, but now you need to assign their age to: 'CHILD', 'ADULT', or 'ELDERLY' category based on their age" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import numpy as np", | |
"execution_count": 71, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Let's create 50 random integers to represent age whose values are between 1 and 99:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "random_integers = np.random.randint(1, 99, 50)", | |
"execution_count": 77, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "random_integers", | |
"execution_count": 78, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 78, | |
"data": { | |
"text/plain": "array([29, 20, 46, 89, 47, 23, 65, 49, 66, 31, 6, 65, 90, 30, 92, 63, 31,\n 18, 66, 58, 76, 17, 54, 96, 95, 34, 95, 97, 56, 53, 92, 95, 64, 83,\n 82, 25, 47, 33, 95, 60, 60, 98, 32, 93, 33, 28, 7, 60, 20, 7])" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.DataFrame({'AGE': random_integers})", | |
"execution_count": 79, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.head(10)", | |
"execution_count": 80, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 80, | |
"data": { | |
"text/plain": " AGE\n0 29\n1 20\n2 46\n3 89\n4 47\n5 23\n6 65\n7 49\n8 66\n9 31", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AGE</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>29</td>\n </tr>\n <tr>\n <th>1</th>\n <td>20</td>\n </tr>\n <tr>\n <th>2</th>\n <td>46</td>\n </tr>\n <tr>\n <th>3</th>\n <td>89</td>\n </tr>\n <tr>\n <th>4</th>\n <td>47</td>\n </tr>\n <tr>\n <th>5</th>\n <td>23</td>\n </tr>\n <tr>\n <th>6</th>\n <td>65</td>\n </tr>\n <tr>\n <th>7</th>\n <td>49</td>\n </tr>\n <tr>\n <th>8</th>\n <td>66</td>\n </tr>\n <tr>\n <th>9</th>\n <td>31</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df['AGE_GROUP'] = pd.cut(df['AGE'], bins=[0, 18, 65, 99], labels=['CHILD', 'ADULT', 'ELDERLY'])\ndf.head(15)", | |
"execution_count": 81, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 81, | |
"data": { | |
"text/plain": " AGE AGE_GROUP\n0 29 ADULT\n1 20 ADULT\n2 46 ADULT\n3 89 ELDERLY\n4 47 ADULT\n5 23 ADULT\n6 65 ADULT\n7 49 ADULT\n8 66 ELDERLY\n9 31 ADULT\n10 6 CHILD\n11 65 ADULT\n12 90 ELDERLY\n13 30 ADULT\n14 92 ELDERLY", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AGE</th>\n <th>AGE_GROUP</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>29</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>1</th>\n <td>20</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>2</th>\n <td>46</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>3</th>\n <td>89</td>\n <td>ELDERLY</td>\n </tr>\n <tr>\n <th>4</th>\n <td>47</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>5</th>\n <td>23</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>6</th>\n <td>65</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>7</th>\n <td>49</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>8</th>\n <td>66</td>\n <td>ELDERLY</td>\n </tr>\n <tr>\n <th>9</th>\n <td>31</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>10</th>\n <td>6</td>\n <td>CHILD</td>\n </tr>\n <tr>\n <th>11</th>\n <td>65</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>12</th>\n <td>90</td>\n <td>ELDERLY</td>\n </tr>\n <tr>\n <th>13</th>\n <td>30</td>\n <td>ADULT</td>\n </tr>\n <tr>\n <th>14</th>\n <td>92</td>\n <td>ELDERLY</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"hide_input": false, | |
"varInspector": { | |
"window_display": false, | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"library": "var_list.py", | |
"delete_cmd_prefix": "del ", | |
"delete_cmd_postfix": "", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"library": "var_list.r", | |
"delete_cmd_prefix": "rm(", | |
"delete_cmd_postfix": ") ", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
] | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.5", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "738827fd0da1df378b9ed3c3d8072b10", | |
"data": { | |
"description": "pandas/TipsNTricks.ipynb", | |
"public": true | |
} | |
}, | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/738827fd0da1df378b9ed3c3d8072b10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment