josephkokchin/Data_Science_Made_Easy_Code.ipynb

## Data_Science_Made_Easy_Code.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import Template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-13T14:32:24.688874Z",
     "start_time": "2019-06-13T14:32:24.584949Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:96% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Notebook Mod\n",
    "from IPython.core.display import display, HTML\n",
    "display(HTML('<style>.container { width:96% !important; }</style>'))\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "# Data Manipulation \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Data Visualisation\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Standard plotly imports\n",
    "import plotly.plotly as py\n",
    "import plotly.graph_objs as go\n",
    "from plotly.offline import iplot, init_notebook_mode, plot, download_plotlyjs\n",
    "\n",
    "# Using plotly + cufflinks in offline mode\n",
    "import cufflinks\n",
    "cufflinks.go_offline(connected=True)\n",
    "init_notebook_mode(connected=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See the column data types and non-missing values\n",
    "data.info\n",
    "\n",
    "# Stats for each column\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Wrangling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chunking your data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_data(file,chunksize=100000):\n",
    "    data_iterator = pd.read_csv(\"large_data.csv\", chunksize)\n",
    "    \n",
    "    chunk_list = []  \n",
    "    \n",
    "    # Each chunk is in dataframe format\n",
    "    for data_chunk in data_iterator:  \n",
    "        #filtered_chunk = chunk_filtering(data_chunk)\n",
    "        chunk_list.append(data_chunk)\n",
    "    \n",
    "    data = pd.concat(chunk_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Changing Data Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"large_data.csv\", dtype={'column_A': np.int32, 'column_B': np.float16})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_dtypes(col_int, col_float, df): \n",
    "    '''\n",
    "    AIM    -> Changing dtypes to save memory\n",
    "     \n",
    "    INPUT  -> List of column names (int, float), df\n",
    "    \n",
    "    OUTPUT -> updated df with smaller memory  \n",
    "    ------\n",
    "    '''\n",
    "    df[col_int] = df[col_int].astype('int32')\n",
    "    df[col_float] = df[col_float].astype('float32')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Restructure strings in columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_col_str(df,col):\n",
    "    # remove a portion of string in a dataframe column - col_1\n",
    "    df[col].replace('[{}]', '', regex=True, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remove white space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_col_white_space(df,col):\n",
    "    '''\n",
    "    remove white space at the beginning of string \n",
    "    '''\n",
    "    df[col] = df[col].str.lstrip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rename column names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rename_col(df): \n",
    "    '''\n",
    "    AIM    -> rename column names\n",
    "    \n",
    "    INPUT  -> df\n",
    "    \n",
    "    OUTPUT -> updated df with new column names \n",
    "    ------\n",
    "    '''\n",
    "    df.rename(index=str, columns={'col_1': 'new_col_1','col_2': 'new_col_1'}, inplace=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Dictionary within Column to New Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_dict(df,col):\n",
    "    \"\"\"\n",
    "    AIM: Extract the dictionaries within a column and create new columns\n",
    "    \"\"\"\n",
    "    return pd.concat([df.drop(col, axis=1), df[col].apply(pd.Series)], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If the list or dictionary is string type\n",
    "\n",
    "# Turn the string into a list\n",
    "def string_to_list:\n",
    "    return df[col].str.strip('{}[]').str.split(',')\n",
    "\n",
    "# Turn the string to a list and concat them into the dataframe\n",
    "def string_to_columns:\n",
    "    return pd.concat([df[col].str.strip('{}[]').str.split(',').apply(pd.Series)], axis=1)\n",
    "\n",
    "# Turn the string to a list calculate the number of items in each list\n",
    "def string_list_length:\n",
    "    return df[col].str.strip('{}[]').str.split(',').apply(len)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save as Excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_excel(df,index=True):\n",
    "    writer = pd.ExcelWriter('../{}.xlsx'.format(str(df)), engine='xlsxwriter')\n",
    "    df.to_excel(writer, index, sheet_name=str(df))\n",
    "    writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Concatenate two columns with strings (with condition)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def concat_col_str_condition(df):\n",
    "    # concat 2 columns with strings if the last 3 letters of the first column are 'pil'\n",
    "    mask = df['col_1'].str.endswith('pil', na=False)\n",
    "    col_new = df[mask]['col_1'] + df[mask]['col_2']\n",
    "    col_new.replace('pil', ' ', regex=True, inplace=True)  # replace the 'pil' with emtpy space"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TimeStamp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert timestamp(from string to datetime format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_str_datetime(df,col,newcol=False): \n",
    "    '''\n",
    "    AIM    -> Convert datetime(String) to datetime(format we want)\n",
    "     \n",
    "    INPUT  -> df, col, newcol\n",
    "    \n",
    "    OUTPUT -> updated df with new datetime format \n",
    "    ------\n",
    "    '''\n",
    "    if newcol=False:\n",
    "        return df[col]=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')\n",
    "    else:\n",
    "        return df.concat(column=col, value=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract the Component TimeStanp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### DateTimeIndex Method \n",
    "\n",
    "[Source](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)\n",
    "\n",
    "The output format is datetime/timestamp."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def timestamp_comp_id(df,col,attr='year'):\n",
    "    \n",
    "    '''\n",
    "    AIM: To Extract A Part of DateTime/TimeStamp in a column\n",
    "    '''\n",
    "    \n",
    "    if attr=='year':\n",
    "        return pd.DatetimeIndex(df[col]).year\n",
    "    elif attr=='month':\n",
    "        return pd.DatetimeIndex(df[col]).month\n",
    "    elif attr=='day':\n",
    "        return pd.DatetimeIndex(df[col]).day\n",
    "    elif attr=='date':\n",
    "        return pd.DatetimeIndex(df[col]).date\n",
    "    elif attr=='time':\n",
    "        return pd.DatetimeIndex(df[col]).time\n",
    "    else:\n",
    "        print('Sorry the attribute stated is not available!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### List Compression Method\n",
    "The output dtype is string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def timestamp_comp_str(df,col,attr='MMDDYYYY'):\n",
    "    \n",
    "    '''\n",
    "    AIM: To Extract A Part of DateTime/TimeStamp in a column and turn into STRING\n",
    "    '''\n",
    "    \n",
    "    col_new=str(col)+'_'+str(attr)\n",
    "    \n",
    "    if attr=='MMDDYYYY':\n",
    "        df[col_new]=[x.strftime('%m/%d/%Y') for x in df[col]]\n",
    "    elif attr=='Full':\n",
    "        df[col_new]=[x.strftime('%Y-%m-%d %H:%M:%S.%f') for x in df[col]]\n",
    "    elif attr=='time':\n",
    "        df[col_new]=[x.strftime('%H:%M:%S.%f') for x in df[col]]\n",
    "    elif attr=='isoformat':\n",
    "    ## 2015-10-01T00:24:42\n",
    "        df[col_new]=[x.isoformat for x in df[col]] \n",
    "    elif attr=='ctime':\n",
    "    ## Thu Oct  1 00:24:42 2015\n",
    "        df[col_new]=[x.ctime for x in df[col]]\n",
    "    elif attr=='str':\n",
    "    ## 2015-10-01 00:24:42\n",
    "        df[col_new]=[x.__str__() for x in df[col]]\n",
    "    else:\n",
    "        print('Sorry the attribute stated is not available!')\n",
    "    \n",
    "    return df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check Missing Columns and %Missing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to calculate missing values by column\n",
    "def missing_values_table(df):\n",
    "        # Total missing values\n",
    "        mis_val = df.isnull().sum()\n",
    "        \n",
    "        # Percentage of missing values\n",
    "        mis_val_percent = 100 * df.isnull().sum() / len(df)\n",
    "        \n",
    "        # Make a table with the results\n",
    "        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n",
    "        \n",
    "        # Rename the columns\n",
    "        mis_val_table_ren_columns = mis_val_table.rename(\n",
    "        columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n",
    "        \n",
    "        # Sort the table by percentage of missing descending\n",
    "        mis_val_table_ren_columns = mis_val_table_ren_columns[\n",
    "            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n",
    "        '% of Total Values', ascending=False).round(1)\n",
    "        \n",
    "        # Print some summary information\n",
    "        print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\"      \n",
    "            \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n",
    "              \" columns that have missing values.\")\n",
    "        \n",
    "        # Return the dataframe with missing information\n",
    "        return mis_val_table_ren_columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get the columns with > 50% missing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-30T15:55:49.926337Z",
     "start_time": "2019-05-30T15:55:41.903967Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Request: 5; Frequency: 0.6235897396860618 requests/s\n"
     ]
    }
   ],
   "source": [
    "missing_df = missing_values_table(data);\n",
    "\n",
    "def missing_filter(df, threshold=50):\n",
    "    missing_df = missing_values_table(df)\n",
    "    missing_columns = list(missing_df[missing_df['% of Total Values'] > int(threshold)].index)\n",
    "    print('We will remove {} columns.'.format(len(missing_columns)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop the missing columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop(columns = list(missing_columns))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Column Types Checking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of each type of column\n",
    "app_train.dtypes.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of unique classes in each object column\n",
    "app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)"
   ]
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": "6c81f240e0ca4dd699fafeb5b2662880",
   "lastKernelId": "07c27537-bc0d-44ed-8043-6b1af3f51b06"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "toc-autonumbering": true,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Import Template"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-06-13T14:32:24.688874Z",
	"start_time": "2019-06-13T14:32:24.584949Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<style>.container { width:96% !important; }</style>"
	],
	"text/plain": [
	"<IPython.core.display.HTML object>"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"data": {
	"text/html": [
	" <script type=\"text/javascript\">\n",
	" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
	" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
	" if (typeof require !== 'undefined') {\n",
	" require.undef(\"plotly\");\n",
	" requirejs.config({\n",
	" paths: {\n",
	" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
	" }\n",
	" });\n",
	" require(['plotly'], function(Plotly) {\n",
	" window._Plotly = Plotly;\n",
	" });\n",
	" }\n",
	" </script>\n",
	" "
	]
	},
	"metadata": {},
	"output_type": "display_data"
	},
	{
	"data": {
	"text/html": [
	" <script type=\"text/javascript\">\n",
	" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
	" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
	" if (typeof require !== 'undefined') {\n",
	" require.undef(\"plotly\");\n",
	" requirejs.config({\n",
	" paths: {\n",
	" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
	" }\n",
	" });\n",
	" require(['plotly'], function(Plotly) {\n",
	" window._Plotly = Plotly;\n",
	" });\n",
	" }\n",
	" </script>\n",
	" "
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# Notebook Mod\n",
	"from IPython.core.display import display, HTML\n",
	"display(HTML('<style>.container { width:96% !important; }</style>'))\n",
	"%matplotlib inline\n",
	"%config InlineBackend.figure_format = 'retina'\n",
	"\n",
	"# Data Manipulation \n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"# Data Visualisation\n",
	"import matplotlib.pyplot as plt\n",
	"import seaborn as sns\n",
	"\n",
	"# Standard plotly imports\n",
	"import plotly.plotly as py\n",
	"import plotly.graph_objs as go\n",
	"from plotly.offline import iplot, init_notebook_mode, plot, download_plotlyjs\n",
	"\n",
	"# Using plotly + cufflinks in offline mode\n",
	"import cufflinks\n",
	"cufflinks.go_offline(connected=True)\n",
	"init_notebook_mode(connected=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Quick Check"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# See the column data types and non-missing values\n",
	"data.info\n",
	"\n",
	"# Stats for each column\n",
	"data.describe()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Data Wrangling"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Chunking your data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def chunk_data(file,chunksize=100000):\n",
	" data_iterator = pd.read_csv(\"large_data.csv\", chunksize)\n",
	" \n",
	" chunk_list = [] \n",
	" \n",
	" # Each chunk is in dataframe format\n",
	" for data_chunk in data_iterator: \n",
	" #filtered_chunk = chunk_filtering(data_chunk)\n",
	" chunk_list.append(data_chunk)\n",
	" \n",
	" data = pd.concat(chunk_list)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Changing Data Type"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.read_csv(\"large_data.csv\", dtype={'column_A': np.int32, 'column_B': np.float16})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"def change_dtypes(col_int, col_float, df): \n",
	" '''\n",
	" AIM -> Changing dtypes to save memory\n",
	" \n",
	" INPUT -> List of column names (int, float), df\n",
	" \n",
	" OUTPUT -> updated df with smaller memory \n",
	" ------\n",
	" '''\n",
	" df[col_int] = df[col_int].astype('int32')\n",
	" df[col_float] = df[col_float].astype('float32')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Restructure strings in columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def remove_col_str(df,col):\n",
	" # remove a portion of string in a dataframe column - col_1\n",
	" df[col].replace('[{}]', '', regex=True, inplace=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Remove white space"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def remove_col_white_space(df,col):\n",
	" '''\n",
	" remove white space at the beginning of string \n",
	" '''\n",
	" df[col] = df[col].str.lstrip()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Rename column names"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def rename_col(df): \n",
	" '''\n",
	" AIM -> rename column names\n",
	" \n",
	" INPUT -> df\n",
	" \n",
	" OUTPUT -> updated df with new column names \n",
	" ------\n",
	" '''\n",
	" df.rename(index=str, columns={'col_1': 'new_col_1','col_2': 'new_col_1'}, inplace=True)\n",
	" return df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Convert Dictionary within Column to New Columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"def extract_dict(df,col):\n",
	" \"\"\"\n",
	" AIM: Extract the dictionaries within a column and create new columns\n",
	" \"\"\"\n",
	" return pd.concat([df.drop(col, axis=1), df[col].apply(pd.Series)], axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# If the list or dictionary is string type\n",
	"\n",
	"# Turn the string into a list\n",
	"def string_to_list:\n",
	" return df[col].str.strip('{}[]').str.split(',')\n",
	"\n",
	"# Turn the string to a list and concat them into the dataframe\n",
	"def string_to_columns:\n",
	" return pd.concat([df[col].str.strip('{}[]').str.split(',').apply(pd.Series)], axis=1)\n",
	"\n",
	"# Turn the string to a list calculate the number of items in each list\n",
	"def string_list_length:\n",
	" return df[col].str.strip('{}[]').str.split(',').apply(len)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Save as Excel"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def save_excel(df,index=True):\n",
	" writer = pd.ExcelWriter('../{}.xlsx'.format(str(df)), engine='xlsxwriter')\n",
	" df.to_excel(writer, index, sheet_name=str(df))\n",
	" writer.save()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Concatenate two columns with strings (with condition)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def concat_col_str_condition(df):\n",
	" # concat 2 columns with strings if the last 3 letters of the first column are 'pil'\n",
	" mask = df['col_1'].str.endswith('pil', na=False)\n",
	" col_new = df[mask]['col_1'] + df[mask]['col_2']\n",
	" col_new.replace('pil', ' ', regex=True, inplace=True) # replace the 'pil' with emtpy space"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## TimeStamp"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Convert timestamp(from string to datetime format)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def convert_str_datetime(df,col,newcol=False): \n",
	" '''\n",
	" AIM -> Convert datetime(String) to datetime(format we want)\n",
	" \n",
	" INPUT -> df, col, newcol\n",
	" \n",
	" OUTPUT -> updated df with new datetime format \n",
	" ------\n",
	" '''\n",
	" if newcol=False:\n",
	" return df[col]=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')\n",
	" else:\n",
	" return df.concat(column=col, value=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')) "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Extract the Component TimeStanp"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### DateTimeIndex Method \n",
	"\n",
	"[Source](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)\n",
	"\n",
	"The output format is datetime/timestamp."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"def timestamp_comp_id(df,col,attr='year'):\n",
	" \n",
	" '''\n",
	" AIM: To Extract A Part of DateTime/TimeStamp in a column\n",
	" '''\n",
	" \n",
	" if attr=='year':\n",
	" return pd.DatetimeIndex(df[col]).year\n",
	" elif attr=='month':\n",
	" return pd.DatetimeIndex(df[col]).month\n",
	" elif attr=='day':\n",
	" return pd.DatetimeIndex(df[col]).day\n",
	" elif attr=='date':\n",
	" return pd.DatetimeIndex(df[col]).date\n",
	" elif attr=='time':\n",
	" return pd.DatetimeIndex(df[col]).time\n",
	" else:\n",
	" print('Sorry the attribute stated is not available!')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### List Compression Method\n",
	"The output dtype is string."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"def timestamp_comp_str(df,col,attr='MMDDYYYY'):\n",
	" \n",
	" '''\n",
	" AIM: To Extract A Part of DateTime/TimeStamp in a column and turn into STRING\n",
	" '''\n",
	" \n",
	" col_new=str(col)+'_'+str(attr)\n",
	" \n",
	" if attr=='MMDDYYYY':\n",
	" df[col_new]=[x.strftime('%m/%d/%Y') for x in df[col]]\n",
	" elif attr=='Full':\n",
	" df[col_new]=[x.strftime('%Y-%m-%d %H:%M:%S.%f') for x in df[col]]\n",
	" elif attr=='time':\n",
	" df[col_new]=[x.strftime('%H:%M:%S.%f') for x in df[col]]\n",
	" elif attr=='isoformat':\n",
	" ## 2015-10-01T00:24:42\n",
	" df[col_new]=[x.isoformat for x in df[col]] \n",
	" elif attr=='ctime':\n",
	" ## Thu Oct 1 00:24:42 2015\n",
	" df[col_new]=[x.ctime for x in df[col]]\n",
	" elif attr=='str':\n",
	" ## 2015-10-01 00:24:42\n",
	" df[col_new]=[x.__str__() for x in df[col]]\n",
	" else:\n",
	" print('Sorry the attribute stated is not available!')\n",
	" \n",
	" return df.head(5)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# EDA"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Check Missing Columns and %Missing"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Function to calculate missing values by column\n",
	"def missing_values_table(df):\n",
	" # Total missing values\n",
	" mis_val = df.isnull().sum()\n",
	" \n",
	" # Percentage of missing values\n",
	" mis_val_percent = 100 * df.isnull().sum() / len(df)\n",
	" \n",
	" # Make a table with the results\n",
	" mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n",
	" \n",
	" # Rename the columns\n",
	" mis_val_table_ren_columns = mis_val_table.rename(\n",
	" columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n",
	" \n",
	" # Sort the table by percentage of missing descending\n",
	" mis_val_table_ren_columns = mis_val_table_ren_columns[\n",
	" mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n",
	" '% of Total Values', ascending=False).round(1)\n",
	" \n",
	" # Print some summary information\n",
	" print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\" \n",
	" \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n",
	" \" columns that have missing values.\")\n",
	" \n",
	" # Return the dataframe with missing information\n",
	" return mis_val_table_ren_columns"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Get the columns with > 50% missing"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-05-30T15:55:49.926337Z",
	"start_time": "2019-05-30T15:55:41.903967Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Request: 5; Frequency: 0.6235897396860618 requests/s\n"
	]
	}
	],
	"source": [
	"missing_df = missing_values_table(data);\n",
	"\n",
	"def missing_filter(df, threshold=50):\n",
	" missing_df = missing_values_table(df)\n",
	" missing_columns = list(missing_df[missing_df['% of Total Values'] > int(threshold)].index)\n",
	" print('We will remove {} columns.'.format(len(missing_columns)))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Drop the missing columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = data.drop(columns = list(missing_columns))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Column Types Checking"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Number of each type of column\n",
	"app_train.dtypes.value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Number of unique classes in each object column\n",
	"app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)"
	]
	}
	],
	"metadata": {
	"@webio": {
	"lastCommId": "6c81f240e0ca4dd699fafeb5b2662880",
	"lastKernelId": "07c27537-bc0d-44ed-8043-6b1af3f51b06"
	},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	},
	"latex_envs": {
	"LaTeX_envs_menu_present": true,
	"autoclose": false,
	"autocomplete": true,
	"bibliofile": "biblio.bib",
	"cite_by": "apalike",
	"current_citInitial": 1,
	"eqLabelWithNumbers": true,
	"eqNumInitial": 1,
	"hotkeys": {
	"equation": "Ctrl-E",
	"itemize": "Ctrl-I"
	},
	"labels_anchors": false,
	"latex_user_defs": false,
	"report_style_numbering": false,
	"user_envs_cfg": false
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	},
	"toc-autonumbering": true,
	"varInspector": {
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"delete_cmd_postfix": "",
	"delete_cmd_prefix": "del ",
	"library": "var_list.py",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"delete_cmd_postfix": ") ",
	"delete_cmd_prefix": "rm(",
	"library": "var_list.r",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	],
	"window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}