ag-ds-bubble

## bp2_gist1.py
# Level-1 regressors, keyed by estimator class name.
# NOTE(review): the boolean presumably switches each model on or off --
# confirm against the code that consumes this mapping.
models_L1 = {
    'XGBRegressor': True,
    'CatBoostRegressor': True,
    'LGBMRegressor': True,
    'GradientBoostingRegressor': True,
    'RandomForestRegressor': True,
    'KNeighborsRegressor': False,
}

# Evaluation metrics, keyed by metric name.
# NOTE(review): this literal was cut off after 'MSE' (no closing brace), which
# made everything after it unparseable. It is closed here with the entries
# that are visible; restore any further entries from the original notebook.
metrics = {
    'RMSLE': True,
    'MAPE': True,
    'MSE': True,
}

## bp2_imports.py
# General
import pandas as pd
import numpy as np
import os, ast
# Never truncate cell contents when rendering frames. ``None`` is the
# supported spelling for "no limit"; the former ``-1`` has been deprecated
# since pandas 1.0.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import pickle

## bp2_path_n_variable.py
####### Path #######
# All locations are built by string concatenation from `root_path`, so every
# folder component keeps its trailing slash.
root_path = '../'

# Folder for files produced by the notebook; created up front if missing.
notebook_results_path = root_path + 'Notebook Results/'
os.makedirs(notebook_results_path, exist_ok=True)

# Data folders and the raw input file.
data_path = root_path + 'Data/'
raw_data_path = data_path + 'Raw Data/'
prepared_data_path = data_path + 'Prepared Data/'
complete_data_path = raw_data_path + 'Data.csv'

#################VARIABLE INITIALISATIONS#####################

## bp2_read.py
# Load the raw CSV; its first column becomes the frame's index.
complete_data = pd.read_csv(
    complete_data_path,
    index_col=0,
    low_memory=False,
)

# Quick sanity output: overall shape and how many index values repeat.
print('Complete Data Contains : ', complete_data.shape)
print(
    'Number of Duplicate Indexes in the data : ',
    complete_data.index.duplicated().sum(),
)

# Bare expression: the two-row preview is computed and then discarded here.
complete_data.head(2)

# Just initialise 'idx' for now, we will use this in the next block
idx = 0

## bp2_analyze.py
## Analysing the data - You gotta go through each column one by one
## Just keep on pressing control+return on this cell till all the columns have been analysed
## And simultaneously keep on jotting down your analysis in a markdown cell for the same

# Select the column at position `idx` of the loaded frame.
# NOTE(review): nothing visible here advances `idx`; presumably it is
# incremented elsewhere between runs of this cell -- confirm.
colAnalysed = complete_data.columns[idx]
print('Column Being Analysed : ', colAnalysed)

# Three single-column views are passed as `dfs`: a random sample of 7 rows,
# the describe() summary, and the 7 highest value counts.
# NOTE(review): this call is cut off in the snippet -- the `captions` argument
# and the closing parenthesis are missing, and `helperFunctionsHandler` is not
# defined in the visible code. Restore both from the original notebook.
helperFunctionsHandler.display_side_by_side([complete_data[[colAnalysed]].sample(7),
                                             complete_data[[colAnalysed]].describe(),
                                             pd.DataFrame(complete_data[colAnalysed].value_counts().sort_values(ascending=False)[:7])],

## bp2_analysing_intuition.py
# Analysing the Value Counts for YearMade
# Bar chart of the number of rows per YearMade value, ordered by year, saved
# as a JPEG in the notebook results folder.
# NOTE(review): `plt` is not imported in the visible code -- presumably
# matplotlib.pyplot; confirm.
ax = (
    complete_data.YearMade
    .value_counts()
    .sort_index()
    .plot(kind='bar')
)
_ = plt.xticks(rotation=70)
_ = plt.xlabel('Years', fontsize=20)
_ = plt.ylabel('Counts', fontsize=20)
_ = plt.title('Number of vehicles by manufacture year', fontsize=30)
plt.savefig(notebook_results_path + 'YearCounts.jpg', bbox_inches='tight')
plt.close()

# Per-column count of missing values, smallest first.
descriptiveDF = pd.DataFrame(
    complete_data.isna().sum(),
    columns=['Null Count'],
).sort_values('Null Count')

## bp2_helper_functions.py
class HelperFunctions:
    """Holder for helper routines used while exploring the data."""
    def __init__(self):
        """Print a start-up message; no instance attributes are set."""
        print('Initialising the Helper Fucntion Class..')

    def display_side_by_side(self, dfs:list, captions:list):
        """Display tables side by side to save vertical space
        Input:
            dfs: list of pandas.DataFrame
            captions: list of table captions

        NOTE(review): only this docstring is visible in the snippet; the
        implementation appears to be cut off, so as written the method does
        nothing and returns None.
        """

## bp2_prepping_data.py
## Columns to be dropped (removed from the frame in place)
columns_to_be_removed = ['datasource', 'MachineID']
complete_data.drop(columns=columns_to_be_removed, inplace=True)

## Rows whose 'YearMade' is 1000 are recoded to -999.
complete_data['YearMade'] = complete_data['YearMade'].replace(1000, -999)

## Parse the saledate column into datetimes.
complete_data['saledate'] = pd.to_datetime(complete_data['saledate'])

## bp2_splitting.py
# Hold out the last `split_ratio` share of rows (by position) as the test set.
split_ratio = 0.05
test_datapoints = int(split_ratio * complete_data.shape[0])

# Split on an explicit row position instead of negative slices: when
# `test_datapoints` is 0 (fewer than 1/split_ratio rows), `[:-0]` is empty and
# `[-0:]` is the whole frame, which would swap the two sets.
split_position = complete_data.shape[0] - test_datapoints
train_data = complete_data.iloc[:split_position]
test_data = complete_data.iloc[split_position:]
print('Shape of the Training Data : ', train_data.shape)
print('Shape of the Testing Data : ', test_data.shape)

# The prepared-data folder is not created anywhere else in the visible code,
# so make sure it exists before writing into it.
os.makedirs(prepared_data_path, exist_ok=True)
train_data.to_csv(prepared_data_path+'TrainData.csv')
test_data.to_csv(prepared_data_path+'TestData.csv')

## bp2_featureengineering.py
# Work on a copy so the engineered columns do not leak into `train_data`.
train_data_EDA = train_data.copy()

# Calendar Variables
# Taken from the vectorised `.dt` accessor instead of a per-element
# `apply(lambda ...)`; this requires `saledate` to be a datetime64 column.
sale_dates = train_data_EDA['saledate'].dt
train_data_EDA['sale_year'] = sale_dates.year
train_data_EDA['sale_month'] = sale_dates.month
train_data_EDA['sale_day'] = sale_dates.day
train_data_EDA['sale_day_of_week'] = sale_dates.dayofweek
# ISO week number, the same value `Timestamp.week` reports; pandas returns it
# in its nullable UInt32 dtype.
train_data_EDA['sale_week_number'] = sale_dates.isocalendar().week

# Product Age
# Rows whose YearMade is the -999 placeholder keep -999; every other row gets
# sale_year - YearMade. Computed column-wise rather than with a row-by-row
# `apply(axis=1)`.
year_made = train_data_EDA['YearMade']
train_data_EDA['product_age'] = np.where(
    year_made != -999,
    train_data_EDA['sale_year'] - year_made,
    -999,
)
	# Level-1 regressors, keyed by estimator class name.
	# NOTE(review): the boolean presumably switches each model on or off --
	# confirm against the code that consumes this mapping.
	models_L1 = {
		'XGBRegressor': True,
		'CatBoostRegressor': True,
		'LGBMRegressor': True,
		'GradientBoostingRegressor': True,
		'RandomForestRegressor': True,
		'KNeighborsRegressor': False,
	}

	# Evaluation metrics, keyed by metric name.
	# NOTE(review): this literal was cut off after 'MSE' (no closing brace),
	# which made the statements after it unparseable. It is closed here with
	# the entries that are visible; restore any further entries from the
	# original notebook.
	metrics = {
		'RMSLE': True,
		'MAPE': True,
		'MSE': True,
	}
	# General
	import pandas as pd
	import numpy as np
	import os, ast
	# Never truncate cell contents when rendering frames. ``None`` is the
	# supported spelling for "no limit"; the former ``-1`` has been deprecated
	# since pandas 1.0.
	pd.set_option('display.max_colwidth', None)
	from tqdm import tqdm_notebook
	import warnings
	warnings.filterwarnings('ignore')
	import pickle
	####### Path #######
	# All locations are built by string concatenation from `root_path`, so
	# every folder component keeps its trailing slash.
	root_path = '../'

	# Folder for files produced by the notebook; created up front if missing.
	notebook_results_path = root_path + 'Notebook Results/'
	os.makedirs(notebook_results_path, exist_ok=True)

	# Data folders and the raw input file.
	data_path = root_path + 'Data/'
	raw_data_path = data_path + 'Raw Data/'
	prepared_data_path = data_path + 'Prepared Data/'
	complete_data_path = raw_data_path + 'Data.csv'

	#################VARIABLE INITIALISATIONS#####################
	# Load the raw CSV; its first column becomes the frame's index.
	complete_data = pd.read_csv(
		complete_data_path,
		index_col=0,
		low_memory=False,
	)

	# Quick sanity output: overall shape and how many index values repeat.
	print('Complete Data Contains : ', complete_data.shape)
	print(
		'Number of Duplicate Indexes in the data : ',
		complete_data.index.duplicated().sum(),
	)

	# Bare expression: the two-row preview is computed and then discarded here.
	complete_data.head(2)

	# Just initialise 'idx' for now, we will use this in the next block
	idx = 0
	## Analysing the data - You gotta go through each column one by one
	## Just keep on pressing control+return on this cell till all the columns have been analysed
	## And simultaneously keep on jotting down your analysis in a markdown cell for the same

	# Select the column at position `idx` of the loaded frame.
	# NOTE(review): nothing visible here advances `idx`; presumably it is
	# incremented elsewhere between runs of this cell -- confirm.
	colAnalysed = complete_data.columns[idx]
	print('Column Being Analysed : ', colAnalysed)

	# Three single-column views are passed as `dfs`: a random sample of 7 rows,
	# the describe() summary, and the 7 highest value counts.
	# NOTE(review): this call is cut off in the snippet -- the `captions`
	# argument and the closing parenthesis are missing, and
	# `helperFunctionsHandler` is not defined in the visible code.
	helperFunctionsHandler.display_side_by_side([complete_data[[colAnalysed]].sample(7),
	complete_data[[colAnalysed]].describe(),
	pd.DataFrame(complete_data[colAnalysed].value_counts().sort_values(ascending=False)[:7])],
	# Analysing the Value Counts for YearMade
	# Bar chart of the number of rows per YearMade value, ordered by year,
	# saved as a JPEG in the notebook results folder.
	# NOTE(review): `plt` is not imported in the visible code -- presumably
	# matplotlib.pyplot; confirm.
	ax = (
		complete_data.YearMade
		.value_counts()
		.sort_index()
		.plot(kind='bar')
	)
	_ = plt.xticks(rotation=70)
	_ = plt.xlabel('Years', fontsize=20)
	_ = plt.ylabel('Counts', fontsize=20)
	_ = plt.title('Number of vehicles by manufacture year', fontsize=30)
	plt.savefig(notebook_results_path + 'YearCounts.jpg', bbox_inches='tight')
	plt.close()

	# Per-column count of missing values, smallest first.
	descriptiveDF = pd.DataFrame(
		complete_data.isna().sum(),
		columns=['Null Count'],
	).sort_values('Null Count')
	class HelperFunctions:
		"""Holder for helper routines used while exploring the data."""

		def __init__(self):
			"""Print a start-up message; no instance attributes are set."""
			print('Initialising the Helper Fucntion Class..')

		def display_side_by_side(self, dfs:list, captions:list):
			"""Display tables side by side to save vertical space
			Input:
				dfs: list of pandas.DataFrame
				captions: list of table captions

			NOTE(review): only this docstring is visible in the snippet; the
			implementation appears to be cut off, so as written the method
			does nothing and returns None.
			"""
	## Columns to be dropped (removed from the frame in place)
	columns_to_be_removed = ['datasource', 'MachineID']
	complete_data.drop(columns=columns_to_be_removed, inplace=True)

	## Rows whose 'YearMade' is 1000 are recoded to -999.
	complete_data['YearMade'] = complete_data['YearMade'].replace(1000, -999)

	## Parse the saledate column into datetimes.
	complete_data['saledate'] = pd.to_datetime(complete_data['saledate'])
	# Hold out the last `split_ratio` share of rows (by position) as the test set.
	split_ratio = 0.05
	test_datapoints = int(split_ratio * complete_data.shape[0])

	# Split on an explicit row position instead of negative slices: when
	# `test_datapoints` is 0 (fewer than 1/split_ratio rows), `[:-0]` is empty
	# and `[-0:]` is the whole frame, which would swap the two sets.
	split_position = complete_data.shape[0] - test_datapoints
	train_data = complete_data.iloc[:split_position]
	test_data = complete_data.iloc[split_position:]
	print('Shape of the Training Data : ', train_data.shape)
	print('Shape of the Testing Data : ', test_data.shape)

	# The prepared-data folder is not created anywhere else in the visible
	# code, so make sure it exists before writing into it.
	os.makedirs(prepared_data_path, exist_ok=True)
	train_data.to_csv(prepared_data_path+'TrainData.csv')
	test_data.to_csv(prepared_data_path+'TestData.csv')
	# Work on a copy so the engineered columns do not leak into `train_data`.
	train_data_EDA = train_data.copy()

	# Calendar Variables
	# Taken from the vectorised `.dt` accessor instead of a per-element
	# `apply(lambda ...)`; this requires `saledate` to be a datetime64 column.
	sale_dates = train_data_EDA['saledate'].dt
	train_data_EDA['sale_year'] = sale_dates.year
	train_data_EDA['sale_month'] = sale_dates.month
	train_data_EDA['sale_day'] = sale_dates.day
	train_data_EDA['sale_day_of_week'] = sale_dates.dayofweek
	# ISO week number, the same value `Timestamp.week` reports; pandas returns
	# it in its nullable UInt32 dtype.
	train_data_EDA['sale_week_number'] = sale_dates.isocalendar().week

	# Product Age
	# Rows whose YearMade is the -999 placeholder keep -999; every other row
	# gets sale_year - YearMade. Computed column-wise rather than with a
	# row-by-row `apply(axis=1)`.
	year_made = train_data_EDA['YearMade']
	train_data_EDA['product_age'] = np.where(
		year_made != -999,
		train_data_EDA['sale_year'] - year_made,
		-999,
	)