Last active
July 13, 2019 14:28
-
-
Save josephkokchin/99ab5802d32a251e2936f26c3d36ed9c to your computer and use it in GitHub Desktop.
This is the list of the useful functions that I have documented for data science related projects.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import Template" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-06-13T14:32:24.688874Z", | |
"start_time": "2019-06-13T14:32:24.584949Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>.container { width:96% !important; }</style>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
" <script type=\"text/javascript\">\n", | |
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n", | |
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n", | |
" if (typeof require !== 'undefined') {\n", | |
" require.undef(\"plotly\");\n", | |
" requirejs.config({\n", | |
" paths: {\n", | |
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n", | |
" }\n", | |
" });\n", | |
" require(['plotly'], function(Plotly) {\n", | |
" window._Plotly = Plotly;\n", | |
" });\n", | |
" }\n", | |
" </script>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
" <script type=\"text/javascript\">\n", | |
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n", | |
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n", | |
" if (typeof require !== 'undefined') {\n", | |
" require.undef(\"plotly\");\n", | |
" requirejs.config({\n", | |
" paths: {\n", | |
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n", | |
" }\n", | |
" });\n", | |
" require(['plotly'], function(Plotly) {\n", | |
" window._Plotly = Plotly;\n", | |
" });\n", | |
" }\n", | |
" </script>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Notebook Mod\n", | |
"from IPython.core.display import display, HTML\n", | |
"display(HTML('<style>.container { width:96% !important; }</style>'))\n", | |
"%matplotlib inline\n", | |
"%config InlineBackend.figure_format = 'retina'\n", | |
"\n", | |
"# Data Manipulation \n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Data Visualisation\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"\n", | |
"# Standard plotly imports\n", | |
"import plotly.plotly as py\n", | |
"import plotly.graph_objs as go\n", | |
"from plotly.offline import iplot, init_notebook_mode, plot, download_plotlyjs\n", | |
"\n", | |
"# Using plotly + cufflinks in offline mode\n", | |
"import cufflinks\n", | |
"cufflinks.go_offline(connected=True)\n", | |
"init_notebook_mode(connected=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Quick Check" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# See the column data types and non-missing values\n", | |
"data.info\n", | |
"\n", | |
"# Stats for each column\n", | |
"data.describe()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data Wrangling" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Chunking your data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def chunk_data(file,chunksize=100000):\n", | |
" data_iterator = pd.read_csv(\"large_data.csv\", chunksize)\n", | |
" \n", | |
" chunk_list = [] \n", | |
" \n", | |
" # Each chunk is in dataframe format\n", | |
" for data_chunk in data_iterator: \n", | |
" #filtered_chunk = chunk_filtering(data_chunk)\n", | |
" chunk_list.append(data_chunk)\n", | |
" \n", | |
" data = pd.concat(chunk_list)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Changing Data Type" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv(\"large_data.csv\", dtype={'column_A': np.int32, 'column_B': np.float16})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def change_dtypes(col_int, col_float, df): \n", | |
" '''\n", | |
" AIM -> Changing dtypes to save memory\n", | |
" \n", | |
" INPUT -> List of column names (int, float), df\n", | |
" \n", | |
" OUTPUT -> updated df with smaller memory \n", | |
" ------\n", | |
" '''\n", | |
" df[col_int] = df[col_int].astype('int32')\n", | |
" df[col_float] = df[col_float].astype('float32')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Restructure strings in columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def remove_col_str(df,col):\n", | |
" # remove a portion of string in a dataframe column - col_1\n", | |
" df[col].replace('[{}]', '', regex=True, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Remove white space" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def remove_col_white_space(df,col):\n", | |
" '''\n", | |
" remove white space at the beginning of string \n", | |
" '''\n", | |
" df[col] = df[col].str.lstrip()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Rename column names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def rename_col(df): \n", | |
" '''\n", | |
" AIM -> rename column names\n", | |
" \n", | |
" INPUT -> df\n", | |
" \n", | |
" OUTPUT -> updated df with new column names \n", | |
" ------\n", | |
" '''\n", | |
" df.rename(index=str, columns={'col_1': 'new_col_1','col_2': 'new_col_1'}, inplace=True)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Convert Dictionary within Column to New Columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def extract_dict(df,col):\n", | |
" \"\"\"\n", | |
" AIM: Extract the dictionaries within a column and create new columns\n", | |
" \"\"\"\n", | |
" return pd.concat([df.drop(col, axis=1), df[col].apply(pd.Series)], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# If the list or dictionary is string type\n", | |
"\n", | |
"# Turn the string into a list\n", | |
"def string_to_list:\n", | |
" return df[col].str.strip('{}[]').str.split(',')\n", | |
"\n", | |
"# Turn the string to a list and concat them into the dataframe\n", | |
"def string_to_columns:\n", | |
" return pd.concat([df[col].str.strip('{}[]').str.split(',').apply(pd.Series)], axis=1)\n", | |
"\n", | |
"# Turn the string to a list calculate the number of items in each list\n", | |
"def string_list_length:\n", | |
" return df[col].str.strip('{}[]').str.split(',').apply(len)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Save as Excel" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def save_excel(df,index=True):\n", | |
" writer = pd.ExcelWriter('../{}.xlsx'.format(str(df)), engine='xlsxwriter')\n", | |
" df.to_excel(writer, index, sheet_name=str(df))\n", | |
" writer.save()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Concatenate two columns with strings (with condition)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def concat_col_str_condition(df):\n", | |
" # concat 2 columns with strings if the last 3 letters of the first column are 'pil'\n", | |
" mask = df['col_1'].str.endswith('pil', na=False)\n", | |
" col_new = df[mask]['col_1'] + df[mask]['col_2']\n", | |
" col_new.replace('pil', ' ', regex=True, inplace=True) # replace the 'pil' with emtpy space" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## TimeStamp" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Convert timestamp(from string to datetime format)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def convert_str_datetime(df,col,newcol=False): \n", | |
" '''\n", | |
" AIM -> Convert datetime(String) to datetime(format we want)\n", | |
" \n", | |
" INPUT -> df, col, newcol\n", | |
" \n", | |
" OUTPUT -> updated df with new datetime format \n", | |
" ------\n", | |
" '''\n", | |
" if newcol=False:\n", | |
" return df[col]=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')\n", | |
" else:\n", | |
" return df.concat(column=col, value=pd.to_datetime(df.col, format='%Y-%m-%d %H:%M:%S.%f')) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Extract the Component TimeStanp" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### DateTimeIndex Method \n", | |
"\n", | |
"[Source](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)\n", | |
"\n", | |
"The output format is datetime/timestamp." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def timestamp_comp_id(df,col,attr='year'):\n", | |
" \n", | |
" '''\n", | |
" AIM: To Extract A Part of DateTime/TimeStamp in a column\n", | |
" '''\n", | |
" \n", | |
" if attr=='year':\n", | |
" return pd.DatetimeIndex(df[col]).year\n", | |
" elif attr=='month':\n", | |
" return pd.DatetimeIndex(df[col]).month\n", | |
" elif attr=='day':\n", | |
" return pd.DatetimeIndex(df[col]).day\n", | |
" elif attr=='date':\n", | |
" return pd.DatetimeIndex(df[col]).date\n", | |
" elif attr=='time':\n", | |
" return pd.DatetimeIndex(df[col]).time\n", | |
" else:\n", | |
" print('Sorry the attribute stated is not available!')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### List Compression Method\n", | |
"The output dtype is string." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def timestamp_comp_str(df,col,attr='MMDDYYYY'):\n", | |
" \n", | |
" '''\n", | |
" AIM: To Extract A Part of DateTime/TimeStamp in a column and turn into STRING\n", | |
" '''\n", | |
" \n", | |
" col_new=str(col)+'_'+str(attr)\n", | |
" \n", | |
" if attr=='MMDDYYYY':\n", | |
" df[col_new]=[x.strftime('%m/%d/%Y') for x in df[col]]\n", | |
" elif attr=='Full':\n", | |
" df[col_new]=[x.strftime('%Y-%m-%d %H:%M:%S.%f') for x in df[col]]\n", | |
" elif attr=='time':\n", | |
" df[col_new]=[x.strftime('%H:%M:%S.%f') for x in df[col]]\n", | |
" elif attr=='isoformat':\n", | |
" ## 2015-10-01T00:24:42\n", | |
" df[col_new]=[x.isoformat for x in df[col]] \n", | |
" elif attr=='ctime':\n", | |
" ## Thu Oct 1 00:24:42 2015\n", | |
" df[col_new]=[x.ctime for x in df[col]]\n", | |
" elif attr=='str':\n", | |
" ## 2015-10-01 00:24:42\n", | |
" df[col_new]=[x.__str__() for x in df[col]]\n", | |
" else:\n", | |
" print('Sorry the attribute stated is not available!')\n", | |
" \n", | |
" return df.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# EDA" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Check Missing Columns and %Missing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Function to calculate missing values by column\n", | |
"def missing_values_table(df):\n", | |
" # Total missing values\n", | |
" mis_val = df.isnull().sum()\n", | |
" \n", | |
" # Percentage of missing values\n", | |
" mis_val_percent = 100 * df.isnull().sum() / len(df)\n", | |
" \n", | |
" # Make a table with the results\n", | |
" mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)\n", | |
" \n", | |
" # Rename the columns\n", | |
" mis_val_table_ren_columns = mis_val_table.rename(\n", | |
" columns = {0 : 'Missing Values', 1 : '% of Total Values'})\n", | |
" \n", | |
" # Sort the table by percentage of missing descending\n", | |
" mis_val_table_ren_columns = mis_val_table_ren_columns[\n", | |
" mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(\n", | |
" '% of Total Values', ascending=False).round(1)\n", | |
" \n", | |
" # Print some summary information\n", | |
" print (\"Your selected dataframe has \" + str(df.shape[1]) + \" columns.\\n\" \n", | |
" \"There are \" + str(mis_val_table_ren_columns.shape[0]) +\n", | |
" \" columns that have missing values.\")\n", | |
" \n", | |
" # Return the dataframe with missing information\n", | |
" return mis_val_table_ren_columns" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get the columns with > 50% missing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-05-30T15:55:49.926337Z", | |
"start_time": "2019-05-30T15:55:41.903967Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Request: 5; Frequency: 0.6235897396860618 requests/s\n" | |
] | |
} | |
], | |
"source": [ | |
"missing_df = missing_values_table(data);\n", | |
"\n", | |
"def missing_filter(df, threshold=50):\n", | |
" missing_df = missing_values_table(df)\n", | |
" missing_columns = list(missing_df[missing_df['% of Total Values'] > int(threshold)].index)\n", | |
" print('We will remove {} columns.'.format(len(missing_columns)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Drop the missing columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data.drop(columns = list(missing_columns))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Column Types Checking" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Number of each type of column\n", | |
"app_train.dtypes.value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Number of unique classes in each object column\n", | |
"app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)" | |
] | |
} | |
], | |
"metadata": { | |
"@webio": { | |
"lastCommId": "6c81f240e0ca4dd699fafeb5b2662880", | |
"lastKernelId": "07c27537-bc0d-44ed-8043-6b1af3f51b06" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"latex_envs": { | |
"LaTeX_envs_menu_present": true, | |
"autoclose": false, | |
"autocomplete": true, | |
"bibliofile": "biblio.bib", | |
"cite_by": "apalike", | |
"current_citInitial": 1, | |
"eqLabelWithNumbers": true, | |
"eqNumInitial": 1, | |
"hotkeys": { | |
"equation": "Ctrl-E", | |
"itemize": "Ctrl-I" | |
}, | |
"labels_anchors": false, | |
"latex_user_defs": false, | |
"report_style_numbering": false, | |
"user_envs_cfg": false | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"toc-autonumbering": true, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment