Skip to content

Instantly share code, notes, and snippets.

@josephkokchin
Last active July 13, 2019 14:28
Show Gist options
  • Save josephkokchin/99ab5802d32a251e2936f26c3d36ed9c to your computer and use it in GitHub Desktop.
Save josephkokchin/99ab5802d32a251e2936f26c3d36ed9c to your computer and use it in GitHub Desktop.
This is the list of the useful functions that I have documented for data science related projects.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Template"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-06-13T14:32:24.688874Z",
"start_time": "2019-06-13T14:32:24.584949Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<style>.container { width:96% !important; }</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Notebook Mod\n",
"from IPython.core.display import display, HTML\n",
"display(HTML('<style>.container { width:96% !important; }</style>'))\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'retina'\n",
"\n",
"# Data Manipulation \n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Data Visualisation\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Standard plotly imports\n",
"import plotly.plotly as py\n",
"import plotly.graph_objs as go\n",
"from plotly.offline import iplot, init_notebook_mode, plot, download_plotlyjs\n",
"\n",
"# Using plotly + cufflinks in offline mode\n",
"import cufflinks\n",
"cufflinks.go_offline(connected=True)\n",
"init_notebook_mode(connected=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Quick Check"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# See the column data types and non-missing values\n",
"data.info\n",
"\n",
"# Stats for each column\n",
"data.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Wrangling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chunking your data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def chunk_data(file,chunksize=100000):\n",
" data_iterator = pd.read_csv(\"large_data.csv\", chunksize)\n",
" \n",
" chunk_list = [] \n",
" \n",
" # Each chunk is in dataframe format\n",
" for data_chunk in data_iterator: \n",
" #filtered_chunk = chunk_filtering(data_chunk)\n",
" chunk_list.append(data_chunk)\n",
" \n",
" data = pd.concat(chunk_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Changing Data Type"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"large_data.csv\", dtype={'column_A': np.int32, 'column_B': np.float16})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def change_dtypes(col_int, col_float, df): \n",
" '''\n",
" AIM -> Changing dtypes to save memory\n",
" \n",
" INPUT -> List of column names (int, float), df\n",
" \n",
" OUTPUT -> updated df with smaller memory \n",
" ------\n",
" '''\n",
" df[col_int] = df[col_int].astype('int32')\n",
" df[col_float] = df[col_float].astype('float32')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Restructure strings in columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def remove_col_str(df,col):\n",
" # remove a portion of string in a dataframe column - col_1\n",
" df[col].replace('[{}]', '', regex=True, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Remove white space"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def remove_col_white_space(df,col):\n",
" '''\n",
" remove white space at the beginning of string \n",
" '''\n",
" df[col] = df[col].str.lstrip()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Rename column names"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def rename_col(df): \n",
" '''\n",
" AIM -> rename column names\n",
" \n",
" INPUT -> df\n",
" \n",
" OUTPUT -> updated df with new column names \n",
" ------\n",
" '''\n",
" df.rename(index=str, columns={'col_1': 'new_col_1','col_2': 'new_col_1'}, inplace=True)\n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert Dictionary within Column to New Columns"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def extract_dict(df,col):\n",
" \"\"\"\n",
" AIM: Extract the dictionaries within a column and create new columns\n",
" \"\"\"\n",
" return pd.concat([df.drop(col, axis=1), df[col].apply(pd.Series)], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If the list or dictionary is string type\n",
"\n",
"# Turn the string into a list\n",
"def string_to_list:\n",
" return df[col].str.strip('{}[]').str.split(',')\n",
"\n",
"# Turn the string to a list and concat them into the dataframe\n",
"def string_to_columns:\n",
" return pd.concat([df[col].str.strip('{}[]').str.split(',').apply(pd.Series)], axis=1)\n",
"\n",
"# Turn the string to a list calculate the number of items in each list\n",
"def string_list_length:\n",
" return df[col].str.strip('{}[]').str.split(',').apply(len)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save as Excel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_excel(df,index=True):\n",
" writer = pd.ExcelWriter('../{}.xlsx'.format(str(df)), engine='xlsxwriter')\n",
" df.to_excel(writer, index, sheet_name=str(df))\n",
" writer.save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concatenate two columns with strings (with condition)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def concat_col_str_condition(df):\n",
" # concat 2 columns with strings if the last 3 letters of the first column are 'pil'\n",
" mask = df['col_1'].str.endswith('pil', na=False)\n",
" col_new = df[mask]['col_1'] + df[mask]['col_2']\n",
" col_new.replace('pil', ' ', regex=True, inplace=True) # replace the 'pil' with emtpy space"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TimeStamp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert timestamp(from string to datetime format)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert_str_datetime(df,col,newcol=False): \n",
" '''\n",
" AIM -> Convert datetime(String) to datetime(format we want)\n",
" \n",
" INPUT -> df, col, newcol\n",
" \n",
" OUTPUT -> updated df with new datetime format \n",
" ------\n",
" '''\n",
" if newcol=False:\n",
" return df[col]=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')\n",
" else:\n",
" return df.concat(column=col, value=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extract the Component TimeStanp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### DateTimeIndex Method \n",
"\n",
"[Source](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)\n",
"\n",
"The output format is datetime/timestamp."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def timestamp_comp_id(df,col,attr='year'):\n",
" \n",
" '''\n",
" AIM: To Extract A Part of DateTime/TimeStamp in a column\n",
" '''\n",
" \n",
" if attr=='year':\n",
" return pd.DatetimeIndex(df[col]).year\n",
" elif attr=='month':\n",
" return pd.DatetimeIndex(df[col]).month\n",
" elif attr=='day':\n",
" return pd.DatetimeIndex(df[col]).day\n",
" elif attr=='date':\n",
" return pd.DatetimeIndex(df[col]).date\n",
" elif attr=='time':\n",
" return pd.DatetimeIndex(df[col]).time\n",
" else:\n",
" print('Sorry the attribute stated is not available!')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### List Compression Method\n",
"The output dtype is string."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def timestamp_comp_str(df,col,attr='MMDDYYYY'):\n",
" \n",
" '''\n",
" AIM: To Extract A Part of DateTime/TimeStamp in a column and turn into STRING\n",
" '''\n",
" \n",
" col_new=str(col)+'_'+str(attr)\n",
" \n",
" if attr=='MMDDYYYY':\n",
" df[col_new]=[x.strftime('%m/%d/%Y') for x in df[col]]\n",
" elif attr=='Full':\n",
" df[col_new]=[x.strftime('%Y-%m-%d %H:%M:%S.%f') for x in df[col]]\n",
" elif attr=='time':\n",
" df[col_new]=[x.strftime('%H:%M:%S.%f') for x in df[col]]\n",
" elif attr=='isoformat':\n",
" ## 2015-10-01T00:24:42\n",
" df[col_new]=[x.isoformat for x in df[col]] \n",
" elif attr=='ctime':\n",
" ## Thu Oct 1 00:24:42 2015\n",
" df[col_new]=[x.ctime for x in df[col]]\n",
" elif attr=='str':\n",
" ## 2015-10-01 00:24:42\n",
" df[col_new]=[x.__str__() for x in df[col]]\n",
" else:\n",
" print('Sorry the attribute stated is not available!')\n",
" \n",
" return df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# EDA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check Missing Columns and %Missing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to calculate missing values by column\n",
"def missing_values_table(df):\n",
" # Total missing values\n",
" mis_val = df.isnull().sum()\n",
" \n",
" # Percentage of missing values\n",
" mis_val_percent = 100 * df.isnull().sum() / len(df)\n",
" \n",
" # Make a table with the results\n",
" mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n",
" \n",
" # Rename the columns\n",
" mis_val_table_ren_columns = mis_val_table.rename(\n",
" columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n",
" \n",
" # Sort the table by percentage of missing descending\n",
" mis_val_table_ren_columns = mis_val_table_ren_columns[\n",
" mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n",
" '% of Total Values', ascending=False).round(1)\n",
" \n",
" # Print some summary information\n",
" print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\" \n",
" \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n",
" \" columns that have missing values.\")\n",
" \n",
" # Return the dataframe with missing information\n",
" return mis_val_table_ren_columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the columns with > 50% missing"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-30T15:55:49.926337Z",
"start_time": "2019-05-30T15:55:41.903967Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Request: 5; Frequency: 0.6235897396860618 requests/s\n"
]
}
],
"source": [
"missing_df = missing_values_table(data);\n",
"\n",
"def missing_filter(df, threshold=50):\n",
" missing_df = missing_values_table(df)\n",
" missing_columns = list(missing_df[missing_df['% of Total Values'] > int(threshold)].index)\n",
" print('We will remove {} columns.'.format(len(missing_columns)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Drop the missing columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = data.drop(columns = list(missing_columns))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Column Types Checking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of each type of column\n",
"app_train.dtypes.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of unique classes in each object column\n",
"app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)"
]
}
],
"metadata": {
"@webio": {
"lastCommId": "6c81f240e0ca4dd699fafeb5b2662880",
"lastKernelId": "07c27537-bc0d-44ed-8043-6b1af3f51b06"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"toc-autonumbering": true,
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment