Skip to content

Instantly share code, notes, and snippets.

@ag-ds-bubble
ag-ds-bubble / bp2_gist1.py
Created December 23, 2019 16:51
Blog Post 2 : A glimpse of the final result
# Boolean switch per level-1 regressor, keyed by the estimator's class name.
# NOTE(review): presumably True means "include this model in the run" - confirm
# against the code that consumes this dict.
models_L1 = {
    'XGBRegressor': True,
    'CatBoostRegressor': True,
    'LGBMRegressor': True,
    'GradientBoostingRegressor': True,
    'RandomForestRegressor': True,
    'KNeighborsRegressor': False,
}
metrics = {'RMSLE':True,
'MAPE':True,
'MSE':True,
@ag-ds-bubble
ag-ds-bubble / bp2_imports.py
Last active December 25, 2019 19:55
Blog Post 2 : Imports
# General
import pandas as pd
import numpy as np
import os, ast
# Show full cell contents when displaying DataFrames. `None` is the supported
# "no limit" value; a negative width (-1) is deprecated since pandas 1.0 and
# is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import pickle
@ag-ds-bubble
ag-ds-bubble / bp2_path_n_variable.py
Last active December 25, 2019 20:06
Blog Post 2 : Paths & Variable Initializations
####### Paths (all relative to the current working directory) #######
root_path = '../'
data_path = f'{root_path}Data/'
raw_data_path = f'{data_path}Raw Data/'
complete_data_path = f'{raw_data_path}Data.csv'
prepared_data_path = f'{data_path}Prepared Data/'
notebook_results_path = f'{root_path}Notebook Results/'
# Make sure the results folder exists; exist_ok avoids an error on re-runs.
os.makedirs(notebook_results_path, exist_ok=True)
#################VARIABLE INITIALISATIONS#####################
@ag-ds-bubble
ag-ds-bubble / bp2_read.py
Last active December 25, 2019 20:55
Blog Post 2 : Reading, Splitting and having first look at the data.
# Load the full dataset, using the first CSV column as the DataFrame index.
complete_data = pd.read_csv(complete_data_path, index_col=0, low_memory=False)
print('Complete Data Contains : ', complete_data.shape)
print('Number of Duplicate Indexes in the data : ', complete_data.index.duplicated().sum())
# Initialise 'idx' here (it is used by the column-analysis cell that follows)
# so that the preview below is the last expression of the cell; a notebook
# only renders the value of a cell's final expression.
idx = 0
complete_data.head(2)
@ag-ds-bubble
ag-ds-bubble / bp2_analyze.py
Last active December 25, 2019 21:07
Blog Post 2 : Reading, Splitting and having first look at the data.
## Column-by-column analysis: this cell inspects the column at position `idx`.
## Re-run the cell (Ctrl+Return) once per column until every column has been analysed,
## and note down the findings for each column in a markdown cell as you go.
## NOTE(review): nothing in the visible part of this cell advances `idx` - confirm it is incremented further down.
colAnalysed = complete_data.columns[idx]
print('Column Being Analysed : ', colAnalysed)
helperFunctionsHandler.display_side_by_side([complete_data[[colAnalysed]].sample(7),
complete_data[[colAnalysed]].describe(),
pd.DataFrame(complete_data[colAnalysed].value_counts().sort_values(ascending=False)[:7])],
@ag-ds-bubble
ag-ds-bubble / bp2_analysing_intuition.py
Last active December 25, 2019 21:32
Blog Post 2 : Reading, Splitting and having first look at the data.
# Bar chart of how many rows carry each YearMade value, ordered by year.
year_counts = complete_data['YearMade'].value_counts().sort_index()
ax = year_counts.plot(kind='bar')
plt.xticks(rotation=70)
plt.xlabel('Years', fontsize=20)
plt.ylabel('Counts', fontsize=20)
plt.title('Number of vehicles by manufacture year', fontsize=30)
# Write the figure to the results folder, then close it.
plt.savefig(notebook_results_path + 'YearCounts.jpg', bbox_inches='tight')
plt.close()
# Number of missing values per column, as a one-column frame sorted ascending.
null_counts = complete_data.isna().sum()
descriptiveDF = pd.DataFrame(null_counts, columns=['Null Count']).sort_values(by='Null Count')
@ag-ds-bubble
ag-ds-bubble / bp2_helper_functions.py
Created December 25, 2019 20:16
Blog Post 2 : Helper Functions
class HelperFunctions:
def __init__(self):
print('Initialising the Helper Fucntion Class..')
def display_side_by_side(self, dfs:list, captions:list):
"""Display tables side by side to save vertical space
Input:
dfs: list of pandas.DataFrame
captions: list of table captions
"""
@ag-ds-bubble
ag-ds-bubble / bp2_prepping_data.py
Created December 25, 2019 21:46
Blog Post 2 : Preparing Data
## Remove the columns listed below from the dataset (in place)
columns_to_be_removed = ['datasource', 'MachineID']
complete_data.drop(columns=columns_to_be_removed, inplace=True)
## Rows whose 'YearMade' is 1000 are recoded to -999
complete_data['YearMade'] = complete_data['YearMade'].replace(to_replace=1000, value=-999)
## Parse the 'saledate' column into datetime values
complete_data['saledate'] = pd.to_datetime(complete_data['saledate'])
@ag-ds-bubble
ag-ds-bubble / bp2_splitting.py
Created December 25, 2019 22:01
Blog Post 2 : Splitting Data
# Hold out the last `split_ratio` fraction of rows (in file order) as test data.
split_ratio = 0.05
test_datapoints = int(split_ratio * complete_data.shape[0])
# Slice on an explicit split position rather than a negative offset: when
# test_datapoints is 0, `[:-0]` would give an empty training set and `[-0:]`
# would put every row into the test set.
split_position = complete_data.shape[0] - test_datapoints
train_data = complete_data.iloc[:split_position]
test_data = complete_data.iloc[split_position:]
print('Shape of the Training Data : ', train_data.shape)
print('Shape of the Testing Data : ', test_data.shape)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(prepared_data_path, exist_ok=True)
train_data.to_csv(prepared_data_path+'TrainData.csv')
test_data.to_csv(prepared_data_path+'TestData.csv')
@ag-ds-bubble
ag-ds-bubble / bp2_featureengineering.py
Last active December 26, 2019 20:40
Blog Post 2 : Feature Engineering
# Work on a copy so the engineered columns are not added to train_data itself.
train_data_EDA = train_data.copy()

# Calendar variables, derived from 'saledate' with the vectorised .dt accessor
# instead of one Python-level lambda call per row.
sale_dates = train_data_EDA['saledate']
train_data_EDA['sale_year'] = sale_dates.dt.year
train_data_EDA['sale_month'] = sale_dates.dt.month
train_data_EDA['sale_day'] = sale_dates.dt.day
train_data_EDA['sale_day_of_week'] = sale_dates.dt.dayofweek
# The week number stays element-wise on purpose: Series.dt.week no longer
# exists in pandas 2.x and dt.isocalendar() is unavailable before pandas 1.1,
# whereas Timestamp.week works on both.
train_data_EDA['sale_week_number'] = sale_dates.apply(lambda ts: ts.week)

# Product age = sale year - manufacture year. Rows whose YearMade is the -999
# placeholder keep -999 instead of producing a meaningless age.
year_made = train_data_EDA['YearMade']
train_data_EDA['product_age'] = (train_data_EDA['sale_year'] - year_made).where(year_made != -999, -999)