pybokeh/PandasTips.ipynb

## PandasTips.ipynb
{
  "cells": [
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Creating date column based on other columns"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** You have dates broken up in 3 columns and need to create an actual column with dates."
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "import pandas as pd",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": "   MONTH  DAY  YEAR\n0      7    1  2019\n1      7    2  2019\n2      7    3  2019\n3      7    4  2019\n4      7    5  2019\n5      7    6  2019\n6      7    7  2019\n7      7    8  2019\n8      7    9  2019\n9      7   10  2019",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>MONTH</th>\n      <th>DAY</th>\n      <th>YEAR</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>7</td>\n      <td>1</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>7</td>\n      <td>2</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>7</td>\n      <td>3</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>7</td>\n      <td>4</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>7</td>\n      <td>5</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>7</td>\n      <td>6</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>7</td>\n      <td>7</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>7</td>\n      <td>8</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>7</td>\n      <td>9</td>\n      <td>2019</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>7</td>\n      <td>10</td>\n      <td>2019</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df['DATE'] = pd.to_datetime(df[['MONTH', 'DAY', 'YEAR']])",
      "execution_count": 5,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": "   MONTH  DAY  YEAR       DATE\n0      7    1  2019 2019-07-01\n1      7    2  2019 2019-07-02\n2      7    3  2019 2019-07-03\n3      7    4  2019 2019-07-04\n4      7    5  2019 2019-07-05\n5      7    6  2019 2019-07-06\n6      7    7  2019 2019-07-07\n7      7    8  2019 2019-07-08\n8      7    9  2019 2019-07-09\n9      7   10  2019 2019-07-10",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>MONTH</th>\n      <th>DAY</th>\n      <th>YEAR</th>\n      <th>DATE</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>7</td>\n      <td>1</td>\n      <td>2019</td>\n      <td>2019-07-01</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>7</td>\n      <td>2</td>\n      <td>2019</td>\n      <td>2019-07-02</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>7</td>\n      <td>3</td>\n      <td>2019</td>\n      <td>2019-07-03</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>7</td>\n      <td>4</td>\n      <td>2019</td>\n      <td>2019-07-04</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>7</td>\n      <td>5</td>\n      <td>2019</td>\n      <td>2019-07-05</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>7</td>\n      <td>6</td>\n      <td>2019</td>\n      <td>2019-07-06</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>7</td>\n      <td>7</td>\n      <td>2019</td>\n      <td>2019-07-07</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>7</td>\n      <td>8</td>\n      <td>2019</td>\n      <td>2019-07-08</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>7</td>\n      <td>9</td>\n      <td>2019</td>\n      <td>2019-07-09</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>7</td>\n      <td>10</td>\n      <td>2019</td>\n      <td>2019-07-10</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.dtypes",
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": "MONTH             int64\nDAY               int64\nYEAR              int64\nDATE     datetime64[ns]\ndtype: object"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Combining string values from multiple columns into a single column"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** You have a first name column and a last name column.  But you need a column that is of the format: last name, first name"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 69,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 69,
          "data": {
            "text/plain": "   ID FirstName LastName\n0   1      John    Smith\n1   2      Jane      Doe\n2   3   Someone     Else",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>FirstName</th>\n      <th>LastName</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>John</td>\n      <td>Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Jane</td>\n      <td>Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Someone</td>\n      <td>Else</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df['LastFirstName'] = df['LastName'] + ', ' + df['FirstName']\ndf",
      "execution_count": 70,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 70,
          "data": {
            "text/plain": "   ID FirstName LastName  LastFirstName\n0   1      John    Smith    Smith, John\n1   2      Jane      Doe      Doe, Jane\n2   3   Someone     Else  Else, Someone",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>FirstName</th>\n      <th>LastName</th>\n      <th>LastFirstName</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>John</td>\n      <td>Smith</td>\n      <td>Smith, John</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Jane</td>\n      <td>Doe</td>\n      <td>Doe, Jane</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Someone</td>\n      <td>Else</td>\n      <td>Else, Someone</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Split up string into multiple columns"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** You have a column with string values, but need the string value broken up into multiple columns."
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()",
      "execution_count": 9,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 10,
          "data": {
            "text/plain": "   ID LastFirstName\n0   1    John Smith\n1   2      Jane Doe\n2   3  Someone Else",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>LastFirstName</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>John Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Jane Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Someone Else</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df[['FirstName', 'LastName']] = df['LastFirstName'].str.split(' ', expand=True)",
      "execution_count": 11,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 12,
          "data": {
            "text/plain": "   ID LastFirstName FirstName LastName\n0   1    John Smith      John    Smith\n1   2      Jane Doe      Jane      Doe\n2   3  Someone Else   Someone     Else",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>LastFirstName</th>\n      <th>FirstName</th>\n      <th>LastName</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>John Smith</td>\n      <td>John</td>\n      <td>Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Jane Doe</td>\n      <td>Jane</td>\n      <td>Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Someone Else</td>\n      <td>Someone</td>\n      <td>Else</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "#### You don't have to consume all the split values into multiple columns, you can choose to keep the first split value or second value using indexing ( [0] or [1] ):"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 33,
          "data": {
            "text/plain": "   ID          LOCATION\n0   1   Los Angeles, CA\n1   2  Washington, D.C.\n2   3      Columbus, OH",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>LOCATION</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>Los Angeles, CA</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Washington, D.C.</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Columbus, OH</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df['CITY'] = df['LOCATION'].str.split(', ', expand=True)[0]",
      "execution_count": 35,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 36,
          "data": {
            "text/plain": "   ID          LOCATION         CITY\n0   1   Los Angeles, CA  Los Angeles\n1   2  Washington, D.C.   Washington\n2   3      Columbus, OH     Columbus",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>LOCATION</th>\n      <th>CITY</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>Los Angeles, CA</td>\n      <td>Los Angeles</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Washington, D.C.</td>\n      <td>Washington</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Columbus, OH</td>\n      <td>Columbus</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df['STATE'] = df['LOCATION'].str.split(', ', expand=True)[1]",
      "execution_count": 38,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 39,
          "data": {
            "text/plain": "   ID          LOCATION         CITY STATE\n0   1   Los Angeles, CA  Los Angeles    CA\n1   2  Washington, D.C.   Washington  D.C.\n2   3      Columbus, OH     Columbus    OH",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>LOCATION</th>\n      <th>CITY</th>\n      <th>STATE</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>Los Angeles, CA</td>\n      <td>Los Angeles</td>\n      <td>CA</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>Washington, D.C.</td>\n      <td>Washington</td>\n      <td>D.C.</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>Columbus, OH</td>\n      <td>Columbus</td>\n      <td>OH</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Clean columns of a specific data type"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** There are hashtag characters in your string columns.  You could have several string columns.  We need to remove them.  There are 3 different ways depending on level of control."
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()",
      "execution_count": 15,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 16,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0      #Hello#         1         4      #Steve#      #Smith#\n1    #GoodBye#         2         5       #Jane#        #Doe#\n2    #Welcome#         3         6       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### Method 1: Using ```isinstance()``` to check for data type:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "def remove_hashtag1(df):\n    for column in df.columns:\n        # if first value of column is of type str, then remove the hashtags\n        if isinstance(df[column][0], str):\n            df[column] = df[column].str.replace('#', '')",
      "execution_count": 17,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "remove_hashtag1(df)",
      "execution_count": 18,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 19,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0        Hello         1         4        Steve        Smith\n1      GoodBye         2         5         Jane          Doe\n2      Welcome         3         6         John        Hicks",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Hello</td>\n      <td>1</td>\n      <td>4</td>\n      <td>Steve</td>\n      <td>Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>GoodBye</td>\n      <td>2</td>\n      <td>5</td>\n      <td>Jane</td>\n      <td>Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Welcome</td>\n      <td>3</td>\n      <td>6</td>\n      <td>John</td>\n      <td>Hicks</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### Method 2: Using ```dtype()``` to check for data type:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "def remove_hashtag2(df):\n    for column in df.columns:\n        # if column is of type 'object', then remove the hashtags\n        if df[column].dtype == 'object':\n            df[column] = df[column].str.replace('#', '')",
      "execution_count": 22,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 20,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0      #Hello#         1         4      #Steve#      #Smith#\n1    #GoodBye#         2         5       #Jane#        #Doe#\n2    #Welcome#         3         6       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "remove_hashtag2(df)",
      "execution_count": 23,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 24,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0        Hello         1         4        Steve        Smith\n1      GoodBye         2         5         Jane          Doe\n2      Welcome         3         6         John        Hicks",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Hello</td>\n      <td>1</td>\n      <td>4</td>\n      <td>Steve</td>\n      <td>Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>GoodBye</td>\n      <td>2</td>\n      <td>5</td>\n      <td>Jane</td>\n      <td>Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Welcome</td>\n      <td>3</td>\n      <td>6</td>\n      <td>John</td>\n      <td>Hicks</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### Method 3: Using  ```is_string_dtype``` to detect column's data type:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from pandas.api.types import is_string_dtype",
      "execution_count": 25,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 26,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0      #Hello#         1         4      #Steve#      #Smith#\n1    #GoodBye#         2         5       #Jane#        #Doe#\n2    #Welcome#         3         6       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "def remove_hashtag3(df):\n    for column in df.columns:\n        # if column is of a string data type, then strip the white space\n        if is_string_dtype(df[column]):\n            df[column] = df[column].str.replace('#', '')",
      "execution_count": 29,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "remove_hashtag3(df)",
      "execution_count": 30,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df",
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 31,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0        Hello         1         4        Steve        Smith\n1      GoodBye         2         5         Jane          Doe\n2      Welcome         3         6         John        Hicks",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Hello</td>\n      <td>1</td>\n      <td>4</td>\n      <td>Steve</td>\n      <td>Smith</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>GoodBye</td>\n      <td>2</td>\n      <td>5</td>\n      <td>Jane</td>\n      <td>Doe</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Welcome</td>\n      <td>3</td>\n      <td>6</td>\n      <td>John</td>\n      <td>Hicks</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Select Columns by Data Type"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** You need to isolate numeric columns from string columns or vice versa"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 52,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 52,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0      #Hello#         1         4      #Steve#      #Smith#\n1    #GoodBye#         2         5       #Jane#        #Doe#\n2    #Welcome#         3         6       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Get count of columns by data type:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.dtypes.value_counts()",
      "execution_count": 55,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 55,
          "data": {
            "text/plain": "object    3\nint64     2\ndtype: int64"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df_num = df.select_dtypes(include='number')\ndf_num",
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 41,
          "data": {
            "text/plain": "   Numeric1  Numeric2\n0         1         4\n1         2         5\n2         3         6",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>6</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df_str = df.select_dtypes(include='object')\ndf_str",
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 42,
          "data": {
            "text/plain": "  str_BadData1 str_BadData2 str_BadData3\n0      #Hello#      #Steve#      #Smith#\n1    #GoodBye#       #Jane#        #Doe#\n2    #Welcome#       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Adding prefix or suffix to your column names"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 50,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 50,
          "data": {
            "text/plain": "  str_BadData1  Numeric1  Numeric2 str_BadData2 str_BadData3\n0      #Hello#         1         4      #Steve#      #Smith#\n1    #GoodBye#         2         5       #Jane#        #Doe#\n2    #Welcome#         3         6       #John#      #Hicks#",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1</th>\n      <th>Numeric1</th>\n      <th>Numeric2</th>\n      <th>str_BadData2</th>\n      <th>str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = df.add_prefix('BEGIN_')\ndf",
      "execution_count": 51,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 51,
          "data": {
            "text/plain": "  BEGIN_str_BadData1  BEGIN_Numeric1  BEGIN_Numeric2 BEGIN_str_BadData2  \\\n0            #Hello#               1               4            #Steve#   \n1          #GoodBye#               2               5             #Jane#   \n2          #Welcome#               3               6             #John#   \n\n  BEGIN_str_BadData3  \n0            #Smith#  \n1              #Doe#  \n2            #Hicks#  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>BEGIN_str_BadData1</th>\n      <th>BEGIN_Numeric1</th>\n      <th>BEGIN_Numeric2</th>\n      <th>BEGIN_str_BadData2</th>\n      <th>BEGIN_str_BadData3</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = df.add_suffix('_END')\ndf",
      "execution_count": 49,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 49,
          "data": {
            "text/plain": "  str_BadData1_END  Numeric1_END  Numeric2_END str_BadData2_END  \\\n0          #Hello#             1             4          #Steve#   \n1        #GoodBye#             2             5           #Jane#   \n2        #Welcome#             3             6           #John#   \n\n  str_BadData3_END  \n0          #Smith#  \n1            #Doe#  \n2          #Hicks#  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>str_BadData1_END</th>\n      <th>Numeric1_END</th>\n      <th>Numeric2_END</th>\n      <th>str_BadData2_END</th>\n      <th>str_BadData3_END</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>#Hello#</td>\n      <td>1</td>\n      <td>4</td>\n      <td>#Steve#</td>\n      <td>#Smith#</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>#GoodBye#</td>\n      <td>2</td>\n      <td>5</td>\n      <td>#Jane#</td>\n      <td>#Doe#</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>#Welcome#</td>\n      <td>3</td>\n      <td>6</td>\n      <td>#John#</td>\n      <td>#Hicks#</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Finding duplicate records"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 56,
          "data": {
            "text/plain": "    K1  K2\n0  one   1\n1  one   2\n2  one   3\n3  two   3\n4  two   3\n5  two   4\n6  two   4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>one</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>one</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>one</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "duplicate_records = df[df.duplicated()]\nduplicate_records",
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 57,
          "data": {
            "text/plain": "    K1  K2\n4  two   3\n6  two   4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>4</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Removing duplicate records"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 58,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 58,
          "data": {
            "text/plain": "    K1  K2\n0  one   1\n1  one   2\n2  one   3\n3  two   3\n4  two   3\n5  two   4\n6  two   4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>one</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>one</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>one</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.drop_duplicates()  # by default, duplicate is defined by all columns",
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 59,
          "data": {
            "text/plain": "    K1  K2\n0  one   1\n1  one   2\n2  one   3\n3  two   3\n5  two   4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>one</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>one</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>one</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "or define what is \"duplicate\" by column name(s):"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.read_clipboard()\ndf",
      "execution_count": 60,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 60,
          "data": {
            "text/plain": "    K1  K2\n0  one   1\n1  one   2\n2  one   3\n3  two   3\n4  two   3\n5  two   4\n6  two   4",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>one</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>one</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>one</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>two</td>\n      <td>4</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.drop_duplicates(subset='K1')  # Record is considered \"duplicated\" based on column K1",
      "execution_count": 61,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 61,
          "data": {
            "text/plain": "    K1  K2\n0  one   1\n3  two   3",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>K1</th>\n      <th>K2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>one</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>two</td>\n      <td>3</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# Create a categorical column based on numeric values or a.k.a. assigning numbers to categories based on pre-defined ranges"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "**SITUATION:** You have people's ages, but now you need to assign their age to: 'CHILD', 'ADULT', or 'ELDERLY' category based on their age"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "import numpy as np",
      "execution_count": 71,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Let's create 50 random integers to represent age whose values are between 1 and 99:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "random_integers = np.random.randint(1, 99, 50)",
      "execution_count": 77,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "random_integers",
      "execution_count": 78,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 78,
          "data": {
            "text/plain": "array([29, 20, 46, 89, 47, 23, 65, 49, 66, 31,  6, 65, 90, 30, 92, 63, 31,\n       18, 66, 58, 76, 17, 54, 96, 95, 34, 95, 97, 56, 53, 92, 95, 64, 83,\n       82, 25, 47, 33, 95, 60, 60, 98, 32, 93, 33, 28,  7, 60, 20,  7])"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = pd.DataFrame({'AGE': random_integers})",
      "execution_count": 79,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.head(10)",
      "execution_count": 80,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 80,
          "data": {
            "text/plain": "   AGE\n0   29\n1   20\n2   46\n3   89\n4   47\n5   23\n6   65\n7   49\n8   66\n9   31",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>AGE</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>29</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>20</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>46</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>89</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>47</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>23</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>65</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>49</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>66</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>31</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df['AGE_GROUP'] = pd.cut(df['AGE'], bins=[0, 18, 65, 99], labels=['CHILD', 'ADULT', 'ELDERLY'])\ndf.head(15)",
      "execution_count": 81,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 81,
          "data": {
            "text/plain": "    AGE AGE_GROUP\n0    29     ADULT\n1    20     ADULT\n2    46     ADULT\n3    89   ELDERLY\n4    47     ADULT\n5    23     ADULT\n6    65     ADULT\n7    49     ADULT\n8    66   ELDERLY\n9    31     ADULT\n10    6     CHILD\n11   65     ADULT\n12   90   ELDERLY\n13   30     ADULT\n14   92   ELDERLY",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>AGE</th>\n      <th>AGE_GROUP</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>29</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>20</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>46</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>89</td>\n      <td>ELDERLY</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>47</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>23</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>65</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>49</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>66</td>\n      <td>ELDERLY</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>31</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>6</td>\n      <td>CHILD</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>65</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>90</td>\n      <td>ELDERLY</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>30</td>\n      <td>ADULT</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>92</td>\n      <td>ELDERLY</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "hide_input": false,
    "varInspector": {
      "window_display": false,
      "cols": {
        "lenName": 16,
        "lenType": 16,
        "lenVar": 40
      },
      "kernels_config": {
        "python": {
          "library": "var_list.py",
          "delete_cmd_prefix": "del ",
          "delete_cmd_postfix": "",
          "varRefreshCmd": "print(var_dic_list())"
        },
        "r": {
          "library": "var_list.r",
          "delete_cmd_prefix": "rm(",
          "delete_cmd_postfix": ") ",
          "varRefreshCmd": "cat(var_dic_list()) "
        }
      },
      "types_to_exclude": [
        "module",
        "function",
        "builtin_function_or_method",
        "instance",
        "_Feature"
      ]
    },
    "language_info": {
      "name": "python",
      "version": "3.6.5",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "gist": {
      "id": "738827fd0da1df378b9ed3c3d8072b10",
      "data": {
        "description": "pandas/TipsNTricks.ipynb",
        "public": true
      }
    },
    "_draft": {
      "nbviewer_url": "https://gist.github.com/738827fd0da1df378b9ed3c3d8072b10"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}