"""# **To import libraries and packages**"""
# Commented out IPython magic to ensure Python compatibility.
# Import necessary libraries
import gc
import numpy as np # linear algebra
from numpy import hstack
from numpy import array
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from scipy import stats
from scipy.stats import norm, skew #for some statistics
# Definitions
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
pd.set_option('display.max_columns', 100) #Display up to 100 columns
pd.set_option('display.max_rows', 100)
# %matplotlib inline
# Check the files available in the directory
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
!pip install scikit-learn==1.0.0
# Confirm sklearn version
from sklearn import __version__
__version__
from google.colab import drive
drive.mount('/content/drive')
#https://drive.google.com/drive/my-drive
#course_path = "/content/drive/My Drive/mnist/"
course_path = "/content/drive/MyDrive/Colab Notebooks/"
# To import datasets
#train_2016 = pd.read_csv('/content/drive/MyDrive/zillow/train_2016_v2.csv/train_2016_v2.csv', low_memory=False)
train_2016 = pd.read_csv(course_path + 'train_2016_v2.csv', low_memory=False)
#properties_2016, NAlist_2016 = reduce_mem_usage(pd.read_csv('/content/drive/MyDrive/zillow/properties_2016.csv/properties_2016.csv',low_memory=False))
#properties_2016 = pd.read_csv('/content/drive/MyDrive/zillow/properties_2016.csv/properties_2016.csv',low_memory=False)
properties_2016 = pd.read_csv(course_path + 'properties_2016.csv',low_memory=False)
#train_2017 = pd.read_csv('/content/drive/MyDrive/zillow/train_2017.csv/train_2017.csv',low_memory=False)
train_2017 = pd.read_csv(course_path +'train_2017.csv',low_memory=False)
#properties_2017, NAlist_2017 = reduce_mem_usage(pd.read_csv('/content/drive/MyDrive/zillow/properties_2017.csv/properties_2017.csv',low_memory=False))
#properties_2017= pd.read_csv('/content/drive/MyDrive/zillow/properties_2017.csv/properties_2017.csv',low_memory=False)
properties_2017= pd.read_csv(course_path + 'properties_2017.csv',low_memory=False)
"""# **2. To explore datasets**"""
data_manual = pd.read_excel(course_path+'zillow_data_dictionary.xlsx')
data_manual.head()
# to inspect dataset
#unique_train2016 = unique_train2016.sort_values(by='parcelid')
properties_2016=properties_2016.sort_values(by='parcelid')
properties_2016.head(15)
"""# To import datasets"""
import os
#HOUSING_PATH = os.path.join(course_path)
#PROPERTIES_2016 = 'properties_2016.csv'
#PROPERTIES_2017 = 'properties_2017.csv'
#TRAIN_2016 = 'train_2016_v2.csv'
#TRAIN_2017 = 'train_2017.csv'
#def load_housing_data(housing_path=HOUSING_PATH):
def load_housing_data(train_2016, properties_2016, train_2017, properties_2017):
    # Pass the raw dataframes in explicitly so the function does not shadow the globals above
    #properties_2016 = pd.read_csv(os.path.join(housing_path, PROPERTIES_2016))
    #properties_2017 = pd.read_csv(os.path.join(housing_path, PROPERTIES_2017))
    #train_2016 = pd.read_csv(os.path.join(housing_path, TRAIN_2016))
    #train_2017 = pd.read_csv(os.path.join(housing_path, TRAIN_2017))
    # Left join will ignore all properties that do not have a logerror (target variable) associated with them
    train_2016 = pd.merge(train_2016, properties_2016, how='left', on='parcelid')
    train_2017 = pd.merge(train_2017, properties_2017, how='left', on='parcelid')
    # Union data for 2016 and 2017 into one dataframe
    all_properties = pd.concat([properties_2016, properties_2017], ignore_index=True)
    all_training = pd.concat([train_2016, train_2017], ignore_index=True)
    return all_properties, all_training
train_2016 = pd.merge(train_2016, properties_2016, how = 'left', on = 'parcelid')
train_2017 = pd.merge(train_2017, properties_2017, how = 'left', on = 'parcelid')
# Union data for 2016 and 2017 into one dataframe
all_properties = pd.concat([properties_2016, properties_2017], ignore_index=True)
all_training = pd.concat([train_2016, train_2017], ignore_index=True)
#all_properties, housing = load_housing_data(train_2016, properties_2016, train_2017, properties_2017)
housing=all_training
housing.head()
print("Total Properties Shape: {}".format(all_properties.shape))
print("-"*50)
housing.info()
# Check for and drop duplicates in training dataset
def check_duplicates(housing):
    idsUnique = len(housing[['parcelid', 'transactiondate']].value_counts())
    idsTotal = housing.shape[0]
    idsDupli = idsTotal - idsUnique
    print("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries")
def drop_duplicates(housing):
    # Drop all duplicate entries which have the same parcelid and transaction date
    print("Dropping all duplicates based on parcelid and transactiondate...")
    return housing.drop_duplicates(subset=['parcelid', 'transactiondate'], keep='last', ignore_index=True)
# Check for and drop duplicates
check_duplicates(housing)
housing = drop_duplicates(housing)
# Validate
check_duplicates(housing)
"""## Target Variable
Target Variable : `logerror`
"""
y = housing.logerror
y.hist(bins=100, figsize=(8,5))
plt.show()
y.describe()
sns.distplot(y , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(y)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('LogError distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(y, plot=plt)
plt.show()
"""### Dropping Outliers
To drop all outliers that are more than 2.5 standard deviations away from the mean.
"""
highest_thres = y.mean() + 2.5*y.std()
lowest_thres = y.mean() - 2.5*y.std()
print("Highest allowed",highest_thres)
print("Lowest allowed", lowest_thres)
# Only the training set outliers will be dropped (not validation or testing set to ensure model performs well on outliers too)
y = y[y > lowest_thres]
y = y[y < highest_thres]
# Update original Housing dataframe
housing = housing[housing.logerror > lowest_thres]
housing = housing[housing.logerror < highest_thres]
# Drop rows containing 75% or more NaN values
percent = 75.0
min_count = int(((100-percent)/100)*housing.shape[1] + 1)
housing = housing.dropna(axis=0, thresh=min_count)
housing.shape
y.hist(bins=100, figsize=(8,5))
plt.show()
#Check the new distribution
sns.distplot(y , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(y)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('LogError distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(y, plot=plt)
plt.show()
"""The skew now appears corrected and the distribution is much closer to normal.
# Create Test / Train Datasets
"""
from zlib import crc32
from sklearn.model_selection import train_test_split
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    train_set = data.loc[~in_test_set]
    test_set = data.loc[in_test_set]
    X_train = train_set.drop("logerror", axis=1)
    y_train = train_set["logerror"].copy()
    X_test = test_set.drop("logerror", axis=1)
    y_test = test_set["logerror"].copy()
    return X_train, X_test, y_train, y_test
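# Quick sanity check (added, illustrative only): the CRC32 hash of an id is deterministic, so a
# given parcel always lands in the same split across reruns, and roughly `test_ratio` of all
# ids fall below the threshold.
sample_ids = pd.Series(range(100000))
print(sample_ids.apply(lambda id_: test_set_check(id_, 0.1)).mean())  # expected to be close to 0.1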
X_other, X_test, y_other, y_test = split_train_test_by_id(housing, 0.1, "parcelid")
print(f"Other Dataset Shape: {X_other.shape}; Test Dataset Shape: {X_test.shape}")
# Split X_other into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_other, y_other, train_size=0.9, random_state=42)
print(f"Training Dataset Shape: {X_train.shape}") # 81% of instances are in training
print(f"Test Dataset Shape: {X_test.shape}") # 10% of instances are in test
print(f"Validation Dataset Shape: {X_val.shape}") # 9% of instances are in validation
# Clear memory
del all_properties, housing; gc.collect()
"""# **Data Preprocessing Pipelines**
### Missing Data
"""
all_data_na = (X_train.isnull().sum() / len(X_train)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data[:35]
print ("Features with one unique value!!")
exclude_unique = []
for c in X_train.columns:
    num_uniques = len(X_train[c].unique())
    if X_train[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print(exclude_unique)
# Training data copy to test individual pipelines
X_temp = X_train.copy()
"""## Dropping Features Pipeline"""
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_drop):
        self.features_to_drop = features_to_drop
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        updated_X = X.drop(self.features_to_drop, axis=1)
        return updated_X
# Drop features
lin_reg_drop_vars = ["finishedsquarefeet13", "finishedsquarefeet15", "finishedfloor1squarefeet", "finishedsquarefeet50",
"storytypeid", "architecturalstyletypeid", "buildingclasstypeid", "typeconstructiontypeid", "finishedsquarefeet6",
"pooltypeid10", "pooltypeid7", "hashottuborspa", "fireplaceflag", "threequarterbathnbr", "calculatedbathnbr",
"fullbathcnt", "numberofstories", "rawcensustractandblock", "censustractandblock",
"finishedsquarefeet12", "taxvaluedollarcnt", "taxamount", "assessmentyear", "roomcnt",
"propertyzoningdesc", "regionidneighborhood", "regionidzip", "taxdelinquencyyear",
"propertycountylandusecode", "regionidcity", "parcelid", "basementsqft", "yardbuildingsqft26", "transactiondate"
]
# Sample code to test pipeline
feat_dropper = FeatureDropper(features_to_drop=lin_reg_drop_vars)
X_temp = feat_dropper.fit_transform(X_temp)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
# Impute 0
impute_0_vars = ["yardbuildingsqft17", "fireplacecnt", "poolcnt", "garagecarcnt", "garagetotalsqft",
"pooltypeid2", "poolsizesum", "decktypeid", "taxdelinquencyflag"]
# Impute mode
impute_mode_vars = ["airconditioningtypeid", "heatingorsystemtypeid", "unitcnt", "fips",
"propertylandusetypeid", "regionidcounty", "yearbuilt"]
# Impute median
impute_median_vars = ["buildingqualitytypeid", "lotsizesquarefeet", "bathroomcnt", "bedroomcnt", "calculatedfinishedsquarefeet",
"structuretaxvaluedollarcnt", "landtaxvaluedollarcnt", "latitude", "longitude"]
univariate_impute_pipe = ColumnTransformer([
("impute_0", SimpleImputer(strategy="constant", fill_value=0), impute_0_vars),
("impute_mode", SimpleImputer(strategy="most_frequent"), impute_mode_vars),
("impute_median", SimpleImputer(strategy="median"), impute_median_vars),
],
remainder='passthrough'
)
# Sample code to test pipeline [ONLY RUN ONE OF UNIVARIATE OR MULTIVARIATE PIPELINES]
X_temp = univariate_impute_pipe.fit_transform(X_temp)
"""## Multivariate Imputation Pipeline
We will be using a `RandomForestRegressor` as the estimator to predict the missing values in each feature. For details on the Random Forest algorithm and tuning its hyperparameters, refer to the modeling section later.
"""
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
cat_impute_vars = ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"]
numeric_impute_vars = ["bathroomcnt", "bedroomcnt", "buildingqualitytypeid", "calculatedfinishedsquarefeet",
"fireplacecnt", "garagecarcnt", "garagetotalsqft", "latitude", "longitude", "lotsizesquarefeet", "poolcnt",
"poolsizesum", "unitcnt", "yardbuildingsqft17", "yearbuilt", "structuretaxvaluedollarcnt", "landtaxvaluedollarcnt"]
multivariate_impute_pipe = ColumnTransformer([
("impute_cats", SimpleImputer(strategy="constant", fill_value='missing'), cat_impute_vars),
("impute_num", IterativeImputer(estimator=RandomForestRegressor(n_estimators=1, max_depth=30, min_samples_leaf=32), random_state=0, max_iter=1), numeric_impute_vars),
],
remainder='passthrough'
)
# Sample code to test pipeline [ONLY RUN ONE OF UNIVARIATE OR MULTIVARIATE PIPELINES]
# X_temp = multivariate_impute_pipe.fit_transform(X_temp)
"""## Column Names Appender Pipeline
The output of the imputation pipeline is not a Pandas DataFrame but a NumPy array. The following transformer takes the imputation pipeline as an input and rebuilds a DataFrame, with column names, from that NumPy array.
"""
class ColumnNamesAppender(BaseEstimator, TransformerMixin):
    """
    Takes a ColumnTransformer pipeline as an input along with the NumPy array
    to output the DataFrame with column names appended to it.
    """
    def __init__(self, column_transformer, orig_columns, num_transformers):
        self.column_transformer = column_transformer
        self.orig_columns = orig_columns
        self.num_transformers = num_transformers
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        X_column_names = self.get_columns_from_transformer(self.column_transformer, self.orig_columns, self.num_transformers)
        # Create dataframe from numpy array and column names
        X = pd.DataFrame(X, columns=X_column_names)
        return X
    @staticmethod
    def get_columns_from_transformer(column_transformer, input_columns, num_transformers):
        col_name = []
        for transformer_in_columns in column_transformer.transformers_: # the last transformer is ColumnTransformer's 'remainder'
            raw_col_name = transformer_in_columns[2]
            if isinstance(transformer_in_columns[1], Pipeline):
                transformer = transformer_in_columns[1].steps[-1][1]
            else:
                transformer = transformer_in_columns[1]
            try:
                names = transformer.get_feature_names([raw_col_name])
            except AttributeError: # if no 'get_feature_names' function, use raw column name
                names = raw_col_name
            if isinstance(names, np.ndarray):
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name
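# Note (added): on newer scikit-learn (>= 1.2, not the 1.0.0 pinned at the top of this notebook),
# the same result can be had without this custom transformer by asking the ColumnTransformer to
# emit DataFrames directly. Sketch only:
# univariate_impute_pipe.set_output(transform="pandas")
# X_df = univariate_impute_pipe.fit_transform(X_temp)  # already a DataFrame with column names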
# Code to test pipeline (using imputation pipeline as sample ColumnTransformer pipeline)
# TODO: Currently unable to handle columns that are passed through remainder
column_appender = ColumnNamesAppender(univariate_impute_pipe, orig_columns=X_train.columns, num_transformers=3)
X_temp = column_appender.fit_transform(X_temp)
# Check if still missing values
all_data_na = (X_temp.isnull().sum() / len(X_temp)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:40]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head()
"""## Convert Variables Types Pipeline
This section casts features to more appropriate data types, since many categorical and boolean variables are currently encoded as floats.
"""
convert_to_int = ["yearbuilt"]
convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"]
convert_to_float= ["bathroomcnt", "bedroomcnt", "buildingqualitytypeid", "calculatedfinishedsquarefeet",
"fireplacecnt", "garagecarcnt", "garagetotalsqft", "latitude", "longitude", "lotsizesquarefeet", "poolcnt",
"poolsizesum", "unitcnt", "yardbuildingsqft17",
"structuretaxvaluedollarcnt", "landtaxvaluedollarcnt"]
class ConvertFeatureType(BaseEstimator, TransformerMixin):
    def __init__(self, convert_to_int=[], convert_to_bool=[], convert_to_string=[], convert_to_float=[]):
        self.convert_to_int = convert_to_int
        self.convert_to_bool = convert_to_bool
        self.convert_to_string = convert_to_string
        self.convert_to_float = convert_to_float
        self.features = {"int": convert_to_int, "float": convert_to_float, "boolean": convert_to_bool, "str": convert_to_string}
    def fit(self, X, y=None):
        return self # Nothing else to do
    def transform(self, X):
        self.map_bool_features(X)
        for data_type in self.features.keys():
            X = self.convert_feature_types(X, data_type)
        return X
    def map_bool_features(self, X):
        """Convert all non null values to True in bool features prior to changing type to Boolean."""
        for var in self.convert_to_bool:
            X[var][X[var].notnull()] = True
    def convert_feature_types(self, X, data_type):
        for var in self.features[data_type]:
            X[var] = X[var].astype(data_type)
        return X
# # Code to test pipeline
feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string, convert_to_float=convert_to_float)
X_temp = feature_type_changer.fit_transform(X_temp)
class ConvertToType(BaseEstimator, TransformerMixin):
    '''
    Variation of the pipeline above to convert ALL columns to a specific type.
    Handy for algorithms such as XGBoost which expect all features to not be object type.
    '''
    def __init__(self, var_type, vars_to_convert=None):
        self.var_type = var_type
        self.vars_to_convert = vars_to_convert
    def fit(self, X, y=None):
        return self # Nothing else to do
    def transform(self, X):
        if self.vars_to_convert:
            for col in self.vars_to_convert:
                X[col] = X[col].astype(self.var_type)
        else:
            for col in X.columns:
                X[col] = X[col].astype(self.var_type)
        return X
# Code to test pipeline
# convert_to_float = ConvertToType(var_type='float', vars_to_convert=sample_vars)
# X_temp = convert_to_float.fit_transform(X_temp)
"""## Date Feature Creator Pipeline
This pipeline creates simple date features by extracting the information from `transactiondate`.
"""
class CreateDateFeatures(BaseEstimator, TransformerMixin):
    """
    Creates simple date features by extracting the information from `transactiondate`
    """
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        dt = pd.to_datetime(X['transactiondate']).dt
        X['transaction_year'] = (dt.year).astype('category')
        X['transaction_month'] = ((dt.year - 2016)*12 + dt.month).astype('category')
        X['transaction_day'] = dt.day
        X['transaction_quarter'] = ((dt.year - 2016)*4 + dt.quarter).astype('category')
        X = X.drop(['transactiondate'], axis=1)
        return X
# Code to test pipeline
# date_feat_creator = CreateDateFeatures()
# X_temp = date_feat_creator.fit_transform(X_temp)
"""## Year Feature Creation Pipeline
The following transformer takes datetime features as input and converts them into years from the present. For example: `1989` is converted into `present_year(2021) - 1989 = 32`
"""
from datetime import date
class CreateYearFeatures(BaseEstimator, TransformerMixin):
    """
    Creates new features converting dates into years from present.
    Eg: 1989 is converted into present_year(2021) - 1989 which is 32.
    """
    def __init__(self, date_features):
        self.date_features = date_features
        self.current_year = date.today().year
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        for var in self.date_features.keys():
            new_var_name = self.date_features[var]
            X[new_var_name] = self.current_year - X[var]
            X[new_var_name] = X[new_var_name].astype('float')
            # Drop old feature
            X.drop(var, axis=1, inplace=True)
        return X
# Date features
date_features = {"yearbuilt": "house_age"}
# Code to test pipeline
year_feat_creator = CreateYearFeatures(date_features=date_features)
X_temp = year_feat_creator.fit_transform(X_temp)
"""## Combining Existing Features Pipeline
The following transformer creates new derived variables by combining existing variables
"""
class CreateDerivedFeatures(BaseEstimator, TransformerMixin):
    """
    Creates new features by combining existing variables
    """
    def __init__(self):
        return None # nothing else to do
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        # Average Size Features
        X['avg_garage_size'] = X['garagetotalsqft'] / X['garagecarcnt']
        X['property_tax_per_sqft'] = X['taxamount'] / X['calculatedfinishedsquarefeet']
        # Average area in sqft per room
        mask = (X.roomcnt >= 1)  # avoid dividing by zero
        X.loc[mask, 'avg_area_per_room'] = X.loc[mask, 'calculatedfinishedsquarefeet'] / X.loc[mask, 'roomcnt']
        # Derived Room Count
        X['derived_room_cnt'] = X['bedroomcnt'] + X['bathroomcnt']
        # Use the derived room_cnt to calculate the avg area again
        mask = (X.derived_room_cnt >= 1)
        X.loc[mask, 'derived_avg_area_per_room'] = X.loc[mask, 'calculatedfinishedsquarefeet'] / X.loc[mask, 'derived_room_cnt']
        # Rotated Coordinates
        X['location_1'] = X['latitude'] + X['longitude']
        X['location_2'] = X['latitude'] - X['longitude']
        X['location_3'] = X['latitude'] + 0.5 * X['longitude']
        X['location_4'] = X['latitude'] - 0.5 * X['longitude']
        # 'finished_area_sqft' and 'total_area' cover only a strict subset of 'finished_area_sqft_calc' in terms of
        # non-missing values. Also, when both fields are not null, the values are always the same.
        # So we can probably drop 'finished_area_sqft' and 'total_area' since they are redundant.
        # If there are some patterns in when the values are missing, we can add two isMissing binary features
        X['missing_finished_area'] = X['finishedsquarefeet12'].isnull().astype(float)
        X['missing_total_area'] = X['finishedsquarefeet15'].isnull().astype(float)
        X = X.drop(['finishedsquarefeet12', 'finishedsquarefeet15'], axis=1)
        X['missing_bathroom_cnt_calc'] = X['calculatedbathnbr'].isnull().astype(float)
        X = X.drop(['calculatedbathnbr'], axis=1)
        return X
# Code to test pipeline
# derived_feat_creator = CreateDerivedFeatures()
# X_temp = derived_feat_creator.fit_transform(X_temp)
"""## Create Aggregated Features Pipeline
The following pipeline creates aggregated features such as aggregations for a specific zip code.
"""
class CreateAggregatedFeatures(BaseEstimator, TransformerMixin):
    """
    Creates aggregated features (group counts and deviations from group means) for a grouping column
    """
    def __init__(self, group_col, agg_cols):
        self.group_col = group_col
        self.agg_cols = agg_cols
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        group_col = self.group_col
        X[group_col + '-groupcnt'] = X[group_col].map(X[group_col].value_counts())
        new_columns = [] # New feature columns added to the DataFrame
        for col in self.agg_cols:
            aggregates = X.groupby(group_col, as_index=False)[col].agg([np.mean])
            aggregates.columns = [group_col + '-' + col + '-' + s for s in ['mean']]
            new_columns += list(aggregates.columns)
            X = X.merge(how='left', right=aggregates, on=group_col)
        for col in self.agg_cols:
            mean = X[group_col + '-' + col + '-mean']
            diff = X[col] - mean
            X[group_col + '-' + col + '-' + 'diff'] = diff
            if col != 'yearbuilt':
                X[group_col + '-' + col + '-' + 'percent'] = diff / mean
        # Set the values of the new features to NaN if the groupcnt is too small (prevent overfitting)
        threshold = 100
        X.loc[X[group_col + '-groupcnt'] < threshold, new_columns] = np.nan
        # Drop the mean features which are not as useful
        X = X.drop([group_col + '-' + col + '-mean' for col in self.agg_cols], axis=1)
        return X
# Code to test pipeline
# aggregated_feat_creator = CreateAggregatedFeatures(group_col=group_col, agg_cols=agg_cols)
# X_temp = aggregated_feat_creator.fit_transform(X_temp)
"""## One-Hot Encoding Categorical Variables + Standardizing Numerical Variables
The following custom transformer uses `ColumnTransformer` to perform One-Hot Encoding on Categorical Features and Robust Scaler on Numerical Features.
"""
class FeatureEncoderAndScaler(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_encode=None, features_to_scale=None, numeric_types=["float"]):
        self.features_to_encode = features_to_encode
        self.features_to_scale = features_to_scale
        self.numeric_types = numeric_types
        self.feature_encoder_scaler = None
    def fit(self, X, y=None):
        if not self.features_to_encode:
            self.features_to_encode = X.select_dtypes(include=["object"]).columns
        if not self.features_to_scale:
            self.features_to_scale = X.select_dtypes(include=self.numeric_types).columns
        feature_encoder_scaler = ColumnTransformer([
            ("ohe_cats", OneHotEncoder(handle_unknown='ignore', sparse=False), self.features_to_encode),
            ("num_scaler", RobustScaler(), self.features_to_scale),
            ],
            remainder='passthrough',
            # verbose_feature_names_out=False,  # To turn off prefixing of transformer name to feature
        )
        self.feature_encoder_scaler = feature_encoder_scaler.fit(X)
        return self
    def transform(self, X):
        # OneHotEncoder returns a numpy array which is converted to a dataframe
        X_np = self.feature_encoder_scaler.transform(X)
        X = pd.DataFrame(
            X_np,
            columns=self.feature_encoder_scaler.get_feature_names_out()
        )
        X = self.convert_feature_types(X)
        return X
    def convert_feature_types(self, X):
        """Convert feature types to object, float, bool based on the column name.
        Columns with `ohe_cats` are object, `num_scaler` are float, `remainder` are bool"""
        for column in X:
            if 'ohe_cats' in column:
                X[column] = X[column].astype("object")
            elif 'num_scaler' in column:
                X[column] = X[column].astype("float")
            elif 'remainder' in column:
                X[column] = X[column].astype("boolean")
        return X
# Code to test pipeline
# cat_features = X_test.select_dtypes(include = ["object"]).columns # optional as encoder automatically detects relevant features
# feature_encoder_scaler = FeatureEncoderAndScaler(features_to_encode=cat_features)
feature_encoder_scaler = FeatureEncoderAndScaler()
X_temp = feature_encoder_scaler.fit_transform(X_temp)
# Sample plot histogram to check standardized variable
X_temp.num_scaler__calculatedfinishedsquarefeet.hist(bins=50, figsize=(8,5))
plt.show()
# Correlation map to see how the features are correlated with each other
corrmat = X_temp.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
"""## Create Polynomial Features Pipeline
To better detect higher degree relations between the target variable and predictors, we'll find the most important features relative to the target and create 3 new polynomial variables for each of the top 8 existing features (out of the 15 numerical features). The three new polynomial features are:
- `feature^2`
- `feature^3`
- `sqrt(feature)`
This transformation is mainly useful for models such as linear regression; more complex models such as Random Forests and GBMs can detect non-linear patterns within the data without such feature engineering.
"""
# Find most important features relative to target (Take absolute value)
print("Find most important features relative to target")
corr_df = X_temp.copy()
corr_df["logerror"] = y_train.values
corr = corr_df.corr()
most_corr_feat = corr.logerror.abs().sort_values(ascending=False)[1:9].index
most_corr_feat
class CreatePolynomialFeatures(BaseEstimator, TransformerMixin):
    """
    Creates new polynomial features from the most important features relative to the target.
    3 new polynomial variables for each of the existing features: squared, cubed, sqrt.
    """
    def __init__(self, most_imp_feat):
        self.most_imp_feat = most_imp_feat
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        for var in self.most_imp_feat:
            # New var names
            s2_var_name = var + '-s2'
            s3_var_name = var + '-s3'
            sq_var_name = var + '-sqrt'
            # Create features
            X[s2_var_name] = X[var] ** 2
            X[s3_var_name] = X[var] ** 3
            X[sq_var_name] = np.sqrt(X[var] + abs(min(X[var]))) # Translate feature to ensure min value is 0 before sqrt
        return X
# Code to test pipeline
poly_feat_creator = CreatePolynomialFeatures(most_corr_feat)
X_temp = poly_feat_creator.fit_transform(X_temp)
# Code to detect any skewed features
numeric_feats = X_temp.dtypes[X_temp.dtypes == 'float'].index
skewed_feats = X_temp[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewed_feats[:20]
X_temp.num_scaler__lotsizesquarefeet.hist(bins=10, figsize=(8,5))
plt.show()
from datetime import date
from scipy.special import boxcox1p
class BoxCoxSkewedFeatures(BaseEstimator, TransformerMixin):
    """
    Performs a Box-Cox transformation on all numerical variables with skewness
    above a certain threshold.
    """
    def __init__(self, skewness_thres=0.75):
        self.skewness_thres = skewness_thres
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        numeric_feats = X.dtypes[X.dtypes == 'float'].index
        skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
        skewed_feats = skewed_feats[abs(skewed_feats) > self.skewness_thres].index
        # Apply box-cox to each variable
        lam = 0.18
        for feat in skewed_feats:
            X[feat] = X[feat] + abs(min(X[feat])) # Translate feature to ensure minimum value is 0
            X[feat] = boxcox1p(X[feat], lam)
        return X
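# For reference (added): scipy's boxcox1p(x, lam) applies the Box-Cox transform to 1 + x,
# i.e. ((1 + x)**lam - 1) / lam for lam != 0 (and log1p(x) for lam == 0), which is why each
# feature is first translated so that its minimum is 0 and stays inside the valid domain.
print(boxcox1p(np.array([0.0, 1.0, 100.0]), 0.18))  # the right tail gets compressed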
# # Code to test pipeline
skew_transformer = BoxCoxSkewedFeatures()
X_temp = skew_transformer.transform(X_temp)
# Check skewed features after box-cox transformation
numeric_feats = X_temp.dtypes[X_temp.dtypes == 'float'].index
skewed_feats = X_temp[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewed_feats[:20]
X_temp.num_scaler__lotsizesquarefeet.hist(bins=10, figsize=(8,5))
plt.show()
X_temp.head()
# Clear memory
del X_temp; gc.collect()
X_prepared = X_train.copy()
X_prepared_val = X_val.copy()
# Feature Dropper Pipeline
feature_dropper = FeatureDropper(features_to_drop=lin_reg_drop_vars)
# Convert Date Features Pipeline
year_feat_creator = CreateYearFeatures(date_features=date_features)
# Feature Encoding and Scaling Pipeline
feature_encoder_scaler = FeatureEncoderAndScaler()
# Transform Skewed Numerical Features Pipeline
skew_transformer = BoxCoxSkewedFeatures()
# Two versions below: Univariate Imputation and Multivariate Imputation - ONLY UNCOMMENT ONE SECTION
###########################################
# 1) Univariate Imputation Pipeline
convert_to_bool = ["pooltypeid2", "decktypeid", "taxdelinquencyflag"]
convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty"]
univariate_impute_pipe = ColumnTransformer([
("impute_0", SimpleImputer(strategy="constant", fill_value=0), impute_0_vars),
("impute_mode", SimpleImputer(strategy="most_frequent"), impute_mode_vars),
("impute_median", SimpleImputer(strategy="median"), impute_median_vars),
],
remainder='passthrough'
)
col_name_appender = ColumnNamesAppender(univariate_impute_pipe, X_train.columns, num_transformers=3)
feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string,
convert_to_float=convert_to_float, convert_to_bool=convert_to_bool)
poly_feat_creator = CreatePolynomialFeatures(most_corr_feat)
############################################
# 2) Multivariate Version
# convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"]
# multivariate_impute_pipe = ColumnTransformer([
# ("impute_cats", SimpleImputer(strategy="constant", fill_value='missing'), cat_impute_vars),
# ("impute_num", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, max_depth=30, min_samples_leaf=32), random_state=0, max_iter=1), numeric_impute_vars),
# ],
# remainder='passthrough'
# )
# col_name_appender = ColumnNamesAppender(multivariate_impute_pipe, X_train.columns, num_transformers=3)
# feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string, convert_to_float=convert_to_float)
# # most_corr_feat = list(map(lambda x: x.replace('num_scaler__',''), most_corr_feat)) # Clean up feature names by removing `num_scaler__`
# poly_feat_creator = CreatePolynomialFeatures(most_corr_feat)
lin_reg_preprocessor = Pipeline([
('feature_dropper', feature_dropper),
('univariate_impute_pipe', univariate_impute_pipe),
# ('multivariate_impute_pipe', multivariate_impute_pipe),
('col_name_appender', col_name_appender),
('feature_type_changer', feature_type_changer),
('year_feat_creator', year_feat_creator),
('feature_encoder_scaler', feature_encoder_scaler),
('poly_feat_creator', poly_feat_creator),
('skew_transformer', skew_transformer),
])
data_prep_pipe = lin_reg_preprocessor.fit(X_prepared)
X_prepared = lin_reg_preprocessor.transform(X_prepared)
X_prepared_val = lin_reg_preprocessor.transform(X_prepared_val)
from sklearn import set_config
set_config(display='diagram')
lin_reg_preprocessor
"""# Select and Train Models
**As this project is primarily for learning purposes, candidate machine learning algorithms are analyzed in greater detail, even when ultimately rejected, in order to better understand them for future use.**
"""
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
lin_reg = LinearRegression()
lin_reg.fit(X_prepared, y_train)
# let's try the full preprocessing pipeline on a few training instances
some_data = X_train.iloc[:5]
some_labels = y_train.iloc[:5]
some_data_prepared = data_prep_pipe.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))
# Baselines for MAE and RMSE
print(f"MAE Baseline: {y_train.mad()}")  # Series.mad() was removed in pandas 2.0; (y_train - y_train.mean()).abs().mean() is equivalent
print(f"RMSE Baseline: {y_train.std()}")
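# Added check: a model that always predicts the training mean has MAE equal to the mean absolute
# deviation and RMSE essentially equal to the standard deviation of the target, which is what
# the two baselines above represent.
const_pred = np.full(len(y_train), y_train.mean())
print(f"MAE of constant-mean prediction: {mean_absolute_error(y_train, const_pred)}")
print(f"RMSE of constant-mean prediction: {mean_squared_error(y_train, const_pred, squared=False)}")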
def get_eval_metrics(models, X, y_true):
    """
    Calculates MAE (Mean Absolute Error) and RMSE (Root Mean Squared Error) on the data set for the input models.
    `models`: list of fit models
    """
    for model in models:
        y_pred = model.predict(X)
        rmse = mean_squared_error(y_true, y_pred, squared=False)
        mae = mean_absolute_error(y_true, y_pred)
        print(f"Model: {model}")
        print(f"MAE: {mae}, RMSE: {rmse}")
# Test usage of RMSE function
# get_eval_metrics([lin_reg, ridge_reg, lasso_reg], X_prepared_val, y_val)
def display_scores(model, scores):
    print("-"*50)
    print("Model:", model)
    print("\nScores:", scores)
    print("\nMean:", scores.mean())
    print("\nStandard deviation:", scores.std())
def get_cross_val_scores(models, X, y, cv=10, fit_params=None):
    """
    Performs k-fold cross validation and calculates MAE for each fold for all input models.
    `models`: list of fit models
    """
    for model in models:
        mae = -cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=cv, fit_params=fit_params)
        display_scores(model, mae)
# Test usage of cross val function
# get_cross_val_scores([lin_reg, ridge_reg], X_prepared, y_train, cv=5)
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_prepared, y_train)
get_eval_metrics([lin_reg], X_prepared_val, y_val)
get_cross_val_scores([lin_reg], X_prepared, y_train, cv=5)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def plot_learning_curves(model, X_train, y_train, X_val, y_val):
    """
    Train the input model on different sized subsets and test on validation set.
    Output a plot of training and validation error for the different sized subsets.
    """
    train_errors, val_errors = [], []
    num_instances = np.linspace(1, len(X_train), num=15).astype(int)
    for m in num_instances:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_absolute_error(y_train[:m], y_train_predict))
        val_errors.append(mean_absolute_error(y_val, y_val_predict))
    plt.plot(num_instances, train_errors, "r-+", linewidth=2, label="train")
    plt.plot(num_instances, val_errors, "b-", linewidth=3, label="val")
    plt.legend(loc='best')
    plt.title(model)
    return plt
lin_reg = LinearRegression()
plt = plot_learning_curves(lin_reg, X_prepared, y_train, X_prepared_val, y_val)
plt.ylim(0, 3000)
plt.show()
# Set of alpha values to test
alphas = np.logspace(0,2,20)
print(f"Testing with alphas={alphas}")
# Tune Ridge Regression
ridgecv = RidgeCV(alphas=alphas)
ridgecv.fit(X_prepared, y_train)
print(f"Best Ridge Alpha: {ridgecv.alpha_}")
# Set of alpha values to test
alphas = np.logspace(1,5,20)
print(f"Testing with alphas={alphas}")
# Tune Lasso Regression
lassocv = LassoCV(alphas=alphas)
lassocv.fit(X_prepared, y_train)
print(f"Best Lasso Alpha: {lassocv.alpha_}")
"""**The extremely high value of `alpha` for Lasso regression above will regularize the model heavily and push coefficients to zero.**"""
# Fit using optimal alpha
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(X_prepared, y_train)
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(X_prepared, y_train)
# get_eval_metrics([ridge, lasso], X_prepared_val, y_val)
get_cross_val_scores([ridge, lasso], X_prepared, y_train, cv=5)
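# Quick check (added): with such a large alpha, Lasso should shrink most coefficients to exactly zero.
print(f"Non-zero Lasso coefficients: {np.sum(lasso.coef_ != 0)} of {len(lasso.coef_)}")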
# Learning Curves for Ridge and Lasso
ridge = Ridge(alpha=ridgecv.alpha_)
plt = plot_learning_curves(ridge, X_prepared, y_train, X_prepared_val, y_val)
plt.show()
lasso = Lasso(alpha=lassocv.alpha_)
plt = plot_learning_curves(lasso, X_prepared, y_train, X_prepared_val, y_val)
plt.show()
"""**As expected, even the regularized Linear Models do not perform any better. Thus, we need to consider more complex algorithms which make fewer assumptions to try and understand the relationship between the features and target variable.** """
X_prepared = X_train.copy()
X_prepared_val = X_val.copy()
# Feature Dropper Pipeline
rf_drop_vars = ['parcelid']
feature_dropper = FeatureDropper(features_to_drop=rf_drop_vars)
# Create Date Features
date_feat_creator = CreateDateFeatures()
final_column_names = list(X_train.columns) + ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter']
# Univariate Impute Pipeline
rf_impute_median_vars = [
"lotsizesquarefeet", "bathroomcnt", "bedroomcnt", "calculatedfinishedsquarefeet", "structuretaxvaluedollarcnt",
"landtaxvaluedollarcnt", "latitude", "longitude", "finishedsquarefeet13", "finishedsquarefeet15",
"finishedfloor1squarefeet", "finishedsquarefeet50", "finishedsquarefeet6", "threequarterbathnbr",
"calculatedbathnbr", "fullbathcnt", "numberofstories", "finishedsquarefeet12", "taxvaluedollarcnt",
"taxamount", "roomcnt", "basementsqft", "yardbuildingsqft26", "unitcnt"
]
rf_impute_0_vars = ["yardbuildingsqft17", "fireplacecnt", "poolcnt", "garagecarcnt", "garagetotalsqft", "poolsizesum"]
rf_cat_vars = [
'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid',
'decktypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2',
'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
'storytypeid', 'typeconstructiontypeid', 'fireplaceflag', 'assessmentyear',
'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock', 'yearbuilt',
'transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter',
]
univariate_impute_pipe = ColumnTransformer([
("rf_impute_cats", SimpleImputer(strategy="constant", fill_value="missing"), rf_cat_vars),
("rf_impute_0_vars", SimpleImputer(strategy="constant", fill_value=0), rf_impute_0_vars),
("rf_impute_median", SimpleImputer(strategy="median"), rf_impute_median_vars),
],
remainder='passthrough'
)
# Column Names Appender
col_name_appender = ColumnNamesAppender(univariate_impute_pipe, final_column_names, num_transformers=3)
# Convert to Cat + One-Hot Encoding
convert_to_cat = ConvertToType(var_type='str', vars_to_convert=rf_cat_vars)
feature_encoder = ColumnTransformer([
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), rf_cat_vars)
],
remainder='passthrough'
)
rf_preprocessor = Pipeline([
('date_feat_creator', date_feat_creator),
('feature_dropper', feature_dropper),
('univariate_impute_pipe', univariate_impute_pipe),
('col_name_appender', col_name_appender),
('convert_to_cat', convert_to_cat),
('feature_encoder', feature_encoder),
])
data_prep_pipe = rf_preprocessor.fit(X_prepared)
X_prepared = rf_preprocessor.transform(X_prepared)
X_prepared_val = rf_preprocessor.transform(X_prepared_val)
X_rf = X_prepared.copy()
X_rf_val = X_prepared_val.copy()
"""### Random Search + Grid Search Results
Below are the results from using Random Search in combination with Grid Search to arrive at the optimized hyperparameters. Sample code for each method is shown in cells below.
1) ``Random Search with max_samples=0.1``
Random grid with 20 different combinations:
    {'max_depth': [5, 16, 27, 38, 50, None],
     'max_features': ['auto', 0.6, 0.7, 0.8],
     'min_samples_leaf': [32, 64, 128, 256]}
Best params:
    {'max_depth': 50,
     'max_features': 0.6,
     'min_samples_leaf': 32}
2) ``Grid Search with max_samples=0.1``
Grid:
    {'max_features': ['auto', 0.5, 0.6, 0.8, 0.9],
     'max_depth': [10, 40, 70, 100, None],
     'min_samples_leaf': [16, 32, 64]}
Best params:
    {'max_depth': 70,
     'max_features': 0.6,
     'min_samples_leaf': 32}
3) ``Final Grid Search``: tune ``n_estimators`` and ``max_samples``
Grid:
    {'max_features': [0.6],
     'max_depth': [70],
     'min_samples_leaf': [32],
     'max_samples': [0.1, 0.3, 0.5],
     'n_estimators': [100, 400, 600]}
Best params:
    {'n_estimators': 400,
     'max_samples': 0.3,
     'max_depth': 70,
     'max_features': 0.6,
     'min_samples_leaf': 32}
**Given that Random Forests are already great out of the box, and given the increasingly long training times, there is little benefit in further optimizing the hyperparameters of the Random Forest Regressor.**
"""
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# Fraction of original dataset given to any individual tree
max_samples = [0.1, 0.2]
# Number of features to consider at every split
#max_features = ['auto', 0.4, 0.6, 0.8, 0.9]
max_features = [0.4, 0.8]
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(50, 100, num = 3)]
max_depth = [int(x) for x in np.linspace(20, 60, num = 3)]
# max_depth.append(None)
# Minimum number of samples required at each leaf node
min_samples_leaf = [16, 32, 48]
# Create the tuning grid
param_grid = {
'max_samples': max_samples,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_leaf': min_samples_leaf,
}
###### Random Grid Sample Code #######
# Use the random grid to narrow down best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(max_samples=0.1)
# Random search of parameters, using 3 fold cross validation,
# search across 20 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=3, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_prepared, y_train)
###### Grid Search Sample Code #######
# Use the Grid search to further narrow down the best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Grid search of parameters, using 3 fold cross validation
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# Fit the grid search model
rf_grid.fit(X_prepared, y_train)
# Output the best estimator from grid search and its hyperparameters
best_rf = rf_grid.best_estimator_
print(best_rf)
#best_rf.best_params_
# BEWARE: With high max_samples and 400 n_estimators, this code takes almost an hour to run
best_params = {
#'n_estimators': 400,
'n_estimators': 100,
#'max_samples': 0.3,
'max_samples': 0.2,
#'max_depth': 70,
'max_depth': 40,
#'max_features': 0.6,
'max_features': 0.4,
#'min_samples_leaf': 32,
'min_samples_leaf': 16,
'random_state': 42,
}
ran_forest = RandomForestRegressor(**best_params)
ran_forest.fit(X_prepared, y_train)
get_eval_metrics([ran_forest], X_prepared_val, y_val)
# get_cross_val_scores([ran_forest], X_prepared, y_train, cv=3)
# WARNING: With high max_samples and high n_estimators, this code takes almost 2 hours to run
from sklearn.ensemble import ExtraTreesRegressor
# Using the best parameters from Random Forest Regressor
best_params = {
'n_estimators': 100,
'max_samples': 0.3,
'max_depth': 70,
'max_features': 0.6,
'min_samples_leaf': 32,
'random_state': 42,
}
extra_trees_reg = ExtraTreesRegressor(**best_params)
extra_trees_reg.fit(X_prepared, y_train)
# get_cross_val_scores([extra_trees_reg], X_prepared, y_train, cv=3)
# Linear SVM Regression
# Trains extremely quickly but is not very useful, as the relationship between the Zillow features and the target is far from linear
from sklearn.svm import LinearSVR
svm_reg_linear = LinearSVR(random_state=42)
svm_reg_linear.fit(X_prepared, y_train)
# SVM with Polynomial Kernel - 'transforms' data points to polynomial space using 'kernel trick'
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel="poly")
svm_poly_reg.fit(X_prepared, y_train)
# SVM with Radial Basis Function (RBF) Kernel
rbf_svm = SVR(kernel = 'rbf')
rbf_svm.fit(X_prepared, y_train)
get_eval_metrics([svm_reg_linear, svm_poly_reg, rbf_svm], X_prepared_val, y_val)
"""# 4) Gradient Boosting Machines
# 4-a) XGBoost
Even though Scikit-Learn has its own implementation of gradient boosting, it lags behind other implementations in both performance and speed. Thus, we will skip Scikit-Learn's implementation in favor of XGBoost, LightGBM, and CatBoost.
### Missing Data
One of the most powerful capabilities of Gradient Boosting machines is their ability to handle missing data. They do not require data imputation and are designed to extract as much information as possible from the rows with missing data.
"""
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
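# Minimal illustration (added, toy data): XGBoost accepts NaNs directly and learns a default
# split direction for missing values at each node, so no imputation step is required.
toy_X = pd.DataFrame({"a": [1.0, np.nan, 3.0, np.nan], "b": [0.5, 0.1, np.nan, 0.2]})
toy_y = pd.Series([1.0, 2.0, 3.0, 4.0])
xgb.XGBRegressor(n_estimators=5).fit(toy_X, toy_y)  # trains without any imputation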
X_prepared = X_train.copy()
X_prepared_val = X_val.copy()
# Data Prep for XGBoost with Original Data
# Feature Dropper Pipeline
xgb_drop = ['parcelid']
feature_dropper = FeatureDropper(features_to_drop=xgb_drop)
# Convert Year Features Pipeline
year_feat_creator = CreateYearFeatures(date_features=date_features)
# Date Feature Creator
date_feat_creator = CreateDateFeatures()
# Feature Encoding Pipeline
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter',
'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid',
'decktypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2',
'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
'storytypeid', 'typeconstructiontypeid', 'fireplaceflag', 'assessmentyear',
'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock',
]
feature_encoder = ColumnTransformer([
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), cat_vars)
],
remainder='passthrough'
)
xgb_preprocessor = Pipeline([
('feature_dropper', feature_dropper),
('date_feat_creator', date_feat_creator),
('year_feat_creator', year_feat_creator),
('feature_encoder', feature_encoder),
])
data_prep_pipe = xgb_preprocessor.fit(X_prepared)
X_prepared = xgb_preprocessor.transform(X_prepared)
X_prepared_val = xgb_preprocessor.transform(X_prepared_val)
X_xgb = X_prepared.copy()
X_xgb_val = X_prepared_val.copy()
# Initialize XGBoost Model
params = {
'learning_rate': 0.3,
'n_estimators': 10000,
'random_state': 42,
}
xgb_base = xgb.XGBRegressor(**params)
# Fit model using early stopping and validation set
fit_params={'early_stopping_rounds': 10,
'eval_metric': 'mae',
'eval_set': [[X_prepared_val, y_val]]}
xgb_base.fit(X_prepared, y_train, **fit_params)
fit_params={'early_stopping_rounds': 10,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_prepared_val, y_val]]}
# get_eval_metrics([xgb_base], X_prepared_val, y_val)
get_cross_val_scores([xgb_base], X_prepared, y_train, cv=3, fit_params=fit_params)
fit_params={
'early_stopping_rounds': 10,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_prepared_val, y_val]],
}
other_params = {
'learning_rate': 0.2,
'n_estimators': 10000,
'random_state': 42,
}
def param_exploration(param_name, param_vals, other_params=other_params, fit_params=fit_params, cross_val=False):
    """
    Function to explore the impact of different values for ONE parameter on MAE.
    Tested using either a single validation set or cross validation (cross_val=True).
    """
    mae_vec = np.zeros(len(param_vals))
    for i, val in enumerate(param_vals):
        other_params[param_name] = val
        xgb_temp = xgb.XGBRegressor(**other_params)
        if cross_val:
            # TESTING WITH CROSS VALIDATION
            mae = -cross_val_score(xgb_temp, X_prepared, y_train, scoring="neg_mean_absolute_error", cv=3, fit_params=fit_params)
            mae_vec[i] = mae.mean()
        else:
            # TESTING WITH SINGLE VALIDATION SET
            xgb_temp.fit(X_prepared, y_train, **fit_params)
            preds = xgb_temp.predict(X_prepared_val)
            mae_vec[i] = mean_absolute_error(y_val, preds)
    plt.plot(param_vals, mae_vec)
# Max Depth Tuning
# From initial testing, cross validation was necessary when testing the different values as the model was starting to overfit on the single validation set.
# BEWARE - TAKES 5-10 MINUTES TO RUN
max_depth_vals_vec=list(range(1,7))
param_exploration(param_name="max_depth", param_vals=max_depth_vals_vec, cross_val=True)
"""## Manually Exploring Parameters (One by One)
**Warning**: Each of the plots generated below take a few minutes to run.
"""
# Subsample: Fraction of dataset to use to build each tree
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, .95, 1]
param_exploration(param_name="subsample", param_vals=param_vals_vec, cross_val=False)
# ColSample By Tree: Ratio of columns chosen when constructing each tree
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_exploration(param_name="colsample_bytree", param_vals=param_vals_vec, cross_val=False)
# ColSample By Node: Ratio of columns chosen at each node (split)
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_exploration(param_name="colsample_bynode", param_vals=param_vals_vec, cross_val=False)
# Reg Lambda: Regularization parameter from Ridge Regression, the larger the value, the more conservative the algorithm
param_vals_vec=[1, 2, 3, 5, 10, 20]
param_exploration(param_name="reg_lambda", param_vals=param_vals_vec, cross_val=False)
# Reg Alpha: Regularization parameter from Lasso Regression, the larger the value, the more conservative the algorithm
param_vals_vec=[1, 2, 3, 5, 10, 20]
param_exploration(param_name="reg_alpha", param_vals=param_vals_vec, cross_val=False)
# Gamma: Another regularization parameter - the larger the value, the more conservative the algorithm
param_vals_vec=[0.1, 0.3, 0.5, 0.7, 1, 2]
param_exploration(param_name="gamma", param_vals=param_vals_vec, cross_val=False)
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
"""We must define a "loss function" which takes a set of parameters and outputs the loss value."""
def eval_model(params):
    xgb_base = xgb.XGBRegressor()
    xgb_base.set_params(**params)
    # TESTING WITH SINGLE VALIDATION SET
    # xgb_base.fit(X_prepared, y_train, eval_set=[(X_prepared_val, y_val)], early_stopping_rounds=20,
    #              eval_metric='mae', verbose=False)
    # preds = xgb_base.predict(X_prepared_val)
    # mae = mean_absolute_error(y_val, preds)
    # TESTING WITH CROSS VALIDATION
    fit_params = {'early_stopping_rounds': 15,
                  'eval_metric': 'mae',
                  'verbose': False,
                  'eval_set': [[X_prepared_val, y_val]]}
    mae = -cross_val_score(xgb_base, X_prepared, y_train, scoring="neg_mean_absolute_error", cv=3, fit_params=fit_params)
    mae = mae.mean()
    return mae
"""- Next, we define the parameter space.
- There are several options to define ranges.
- `randint`: chooses a random integer between 0 and an upper bound
- `choice`: choose from a set of specific values
- `uniform`: choose a float uniformly within the range
- `quniform`: choose from a quantized (discretized) range of floats
See more at hyperopt documentation:
http://hyperopt.github.io/hyperopt/getting-started/search_spaces/
"""
fspace1 = {
'max_depth': hp.randint('max_depth', 7),  # returns an integer in [0, 7), i.e. up to 6
'colsample_bynode': hp.uniform('colsample_bynode',.3,1),
'subsample': hp.quniform('subsample',.3, 1,.05),
'gamma':hp.uniform('gamma',.2, 5),
'tree_method': hp.choice('tree_method', ['approx', 'exact']),
'n_estimators': 10000,
'learning_rate': 0.15,
'random_state': 42,
}
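# Illustration (added): hyperopt can draw raw samples from a search space, which shows what each
# expression type above produces; `hyperopt.pyll.stochastic.sample` ships with hyperopt.
from hyperopt.pyll.stochastic import sample
print([sample(fspace1) for _ in range(3)])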
# Finally, we use the `fmin` function and record our trials in a `Trials` object
# BEWARE: THIS CODE TOOK 7+ HOURS TO RUN WHEN PERFORMING 1200 TRIALS
trials1 = Trials()
best_param = fmin(fn=eval_model, space=fspace1, algo=tpe.suggest, max_evals=10, trials=trials1)
best_param
"""The individual trials can be explored to detect trends within specific parameters
"""
# Extract data from hyperopt trials
max_depth_trial_vals = np.array([t['misc']['vals']['max_depth'] for t in trials1.trials])
ss_trial_vals = np.array([t['misc']['vals']['subsample'] for t in trials1.trials])
gamma_trial_vals = np.array([t['misc']['vals']['gamma'] for t in trials1.trials])
loss_trial_vals = np.array([t['result']['loss'] for t in trials1.trials])
trial_nums = np.array([t['tid'] for t in trials1.trials])
"""Plot trends between different parameter values and corresponding losses.
For example, the ``max_depth`` plot shows losses decreasing as depth increases and then plateauing at a certain value. Because the Kaggle notebook refreshes at startup, the plot from the 1200-trial experiment is unfortunately no longer available :(
"""
# Plot trends between different parameter values and corresponding losses
plt.scatter(max_depth_trial_vals, loss_trial_vals, alpha=.2)
# plt.scatter(ss_trial_vals, loss_trial_vals, alpha=.2)
# plt.scatter(gamma_trial_vals, loss_trial_vals, alpha=.2)
"""### Best Params Model
Based on the 1200 ``hyperopt`` trials, the following hyperparameter values had the best loss:
```
{'colsample_bynode': 0.382,
'gamma': 0.201,
'max_depth': 5,
'subsample': 0.95,
'tree_method': 'exact'}
```
The other important parameters (``learning_rate``, ``n_estimators``) were already set.
"""
fit_params={
'early_stopping_rounds': 15,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_prepared_val, y_val]],
}
best_params = {
'colsample_bynode': 0.75,
'gamma': 0.201,
'max_depth': 5,
'subsample': 0.95,
'tree_method': 'exact',
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': 42,
}
xgb_base = xgb.XGBRegressor(**best_params)
xgb_base.fit(X_prepared, y_train, **fit_params)
get_cross_val_scores([xgb_base], X_prepared, y_train, cv=3, fit_params=fit_params)
!pip install optuna
import lightgbm as lgbm
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import optuna # !pip install optuna
from optuna.integration import LightGBMPruningCallback
X_prepared = X_train.copy()
X_prepared_val = X_val.copy()
class ConvertToCategorical(BaseEstimator, TransformerMixin):
    '''
    Pipeline to convert categorical variables to Pandas Category type.
    This pipeline is specific to LGBM which handles categorical vars differently.
    '''
    def __init__(self, cat_vars):
        self.cat_vars = cat_vars
    def fit(self, X, y=None):
        return self # Nothing else to do
    def transform(self, X):
        for col in self.cat_vars:
            X[col] = pd.Categorical(X[col])
        return X
# Code to test pipeline
# convert_to_cat = ConvertToCategorical(cat_vars=cat_vars)
# X_temp = convert_to_cat.fit_transform(X_temp)
# Feature Dropper Pipeline
var_drop = ['parcelid',]
feature_dropper = FeatureDropper(features_to_drop=var_drop)
# Convert Date Features Pipeline
year_feat_creator = CreateYearFeatures(date_features=date_features)
# Date Feature Creator
date_feat_creator = CreateDateFeatures()
# Derived Features Creator
derived_feat_creator = CreateDerivedFeatures()
# Feature Encoding Pipeline
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid',
'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty',
'regionidneighborhood', 'regionidzip', 'assessmentyear', 'typeconstructiontypeid', 'architecturalstyletypeid',
'buildingclasstypeid', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'hashottuborspa',
'taxdelinquencyyear', 'taxdelinquencyflag', 'fireplaceflag', 'decktypeid', 'pooltypeid10',
'propertycountylandusecode', 'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock'
]
# Feature Encoding Pipeline
feature_encoder = ColumnTransformer([
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), cat_vars)],
remainder='passthrough'
)
lgbm_preprocessor = Pipeline([
('derived_feat_creator', derived_feat_creator),
('feature_dropper', feature_dropper),
('date_feat_creator', date_feat_creator),
('year_feat_creator', year_feat_creator),
('feature_encoder', feature_encoder),
])
data_prep_pipe = lgbm_preprocessor.fit(X_prepared)
X_prepared = lgbm_preprocessor.transform(X_prepared)
X_prepared_val = lgbm_preprocessor.transform(X_prepared_val)
X_lgbm = X_prepared.copy()
X_lgbm_val = X_prepared_val.copy()
# Initialize LightGBM Model
params = {
'n_estimators': 10000,
'random_state': 42
}
# Fit model using early stopping and validation set
fit_params={'early_stopping_rounds': 30,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_prepared_val, y_val]]
}
lgbm_base = lgbm.LGBMRegressor(**params)
lgbm_base.fit(X_prepared, y_train, **fit_params)
get_cross_val_scores([lgbm_base], X_prepared, y_train, cv=3, fit_params=fit_params)
def objective(trial, X, y):
param_grid = {
"n_estimators": trial.suggest_categorical("n_estimators", [10000]),
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
"num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
"max_depth": trial.suggest_int("max_depth", 3, 12),
"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
"bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
"feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = np.empty(5)
for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
model = lgbm.LGBMRegressor(**param_grid)
model.fit(
X_train,
y_train,
eval_set=[(X_test, y_test)],
eval_metric="mae",
verbose=False,
early_stopping_rounds=25,
callbacks=[
LightGBMPruningCallback(trial, "l1")
], # Add a pruning callback
)
preds = model.predict(X_test)
cv_scores[idx] = mean_absolute_error(y_test, preds)
return np.mean(cv_scores)
# Similar to Hyperopt - this code creates an experiment using the objective function and parameter grid above
study = optuna.create_study(direction="minimize", study_name="LGBM Regressor")
func = lambda trial: objective(trial, X_prepared, y_train.to_numpy())
study.optimize(func, n_trials=100)
print(f"\tBest value (mae): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
print(f"\t\t{key}: {value}")
"""### Best Params Model
Based on the ``Optuna`` trials, the following hyperparameter values gave the best loss:
```
best_params = {
'num_leaves': 512,
'max_depth': 9,
'lambda_l2': 15,
'bagging_fraction': 0.9,
'feature_fraction': 0.6,
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': 42,
}
```
The other important parameters (``learning_rate``, ``n_estimators``) were already set.
"""
fit_params={
'early_stopping_rounds': 30,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_prepared_val, y_val]]
}
best_params = {
'num_leaves': 512,
'max_depth': 9,
'lambda_l2': 15,
'bagging_fraction': 0.9,
'feature_fraction': 0.6,
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': 42,
}
lgbm_base = lgbm.LGBMRegressor(**best_params)
lgbm_base.fit(X_prepared, y_train, **fit_params)
get_cross_val_scores([lgbm_base], X_prepared, y_train, cv=3, fit_params=fit_params)
!pip install catboost
import catboost as cb
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
X_prepared = X_train.copy()
X_prepared_val = X_val.copy()
# Feature Dropper Pipeline
var_drop = ['parcelid',]
feature_dropper = FeatureDropper(features_to_drop=var_drop)
# Convert Date Features Pipeline
year_feat_creator = CreateYearFeatures(date_features=date_features)
# Feature Encoding Pipeline
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid',
'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty',
'regionidneighborhood', 'regionidzip', 'assessmentyear', 'typeconstructiontypeid', 'architecturalstyletypeid',
            'buildingclasstypeid', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'hashottuborspa',
'taxdelinquencyyear', 'taxdelinquencyflag', 'fireplaceflag', 'decktypeid', 'pooltypeid10',
'propertycountylandusecode', 'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock'
]
convert_to_cat = ConvertToType(var_type='str', vars_to_convert=cat_vars)
# Date Feature Creator
date_feat_creator = CreateDateFeatures()
# Derived Features Creator
derived_feat_creator = CreateDerivedFeatures()
# Aggregated Feature Creator
group_col = 'regionidcity'
agg_cols = ['lotsizesquarefeet', 'yearbuilt', 'calculatedfinishedsquarefeet',
'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount', 'property_tax_per_sqft']
aggregated_feat_creator = CreateAggregatedFeatures(group_col=group_col, agg_cols=agg_cols)
cb_preprocessor = Pipeline([
('derived_feat_creator', derived_feat_creator),
# ('aggregated_feat_creator', aggregated_feat_creator),
('feature_dropper', feature_dropper),
('date_feat_creator', date_feat_creator),
('year_feat_creator', year_feat_creator),
('convert_to_cat', convert_to_cat),
])
data_prep_pipe = cb_preprocessor.fit(X_prepared)
X_prepared = cb_preprocessor.transform(X_prepared)
X_prepared_val = cb_preprocessor.transform(X_prepared_val)
X_cb = X_prepared.copy()
X_cb_val = X_prepared_val.copy()
# Initialize CatBoost Model
params = {
'n_estimators': 100,
'random_state': 42,
'eval_metric': 'MAE',
}
# Fit model using early stopping and validation set
fit_params={
'eval_set': (X_prepared_val, y_val),
'cat_features': cat_vars,
'early_stopping_rounds': 10,
'verbose': False,
}
cb_base = cb.CatBoostRegressor(**params)
cb_base.fit(X_prepared, y_train, **fit_params)
get_eval_metrics([cb_base], X_prepared_val, y_val)
# get_cross_val_scores([cb_base], X_prepared, y_train, cv=3, fit_params=fit_params)
"""## CatBoost Hyperparameter Tuning (Using Optuna)
Given Optuna's better documentation compared to ``hyperopt``, I decided to continue using it for CatBoost hyperparameter tuning.
"""
def objective(trial, X, y):
param_grid = {
# Fixed Params
"eval_metric": "MAE",
"n_estimators": 10000,
"random_state": 42,
# Tuned Params
"learning_rate": trial.suggest_loguniform("learning_rate", 0.05, 0.3),
"l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e1),
"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
"depth": trial.suggest_int("depth", 1, 10),
"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
"bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
"one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
}
# Conditional Hyper-Parameters
if param_grid["bootstrap_type"] == "Bayesian":
param_grid["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
elif param_grid["bootstrap_type"] == "Bernoulli":
param_grid["subsample"] = trial.suggest_float("subsample", 0.1, 1)
cv = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = np.empty(3)
for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
model = cb.CatBoostRegressor(**param_grid)
model.fit(
X_train,
y_train,
eval_set=[(X_test, y_test)],
cat_features=cat_vars,
early_stopping_rounds=20,
verbose=False
)
preds = model.predict(X_test)
cv_scores[idx] = mean_absolute_error(y_test, preds)
return np.mean(cv_scores)
# Similar to Hyperopt - this code creates an experiment using the objective function and parameter grid above
study = optuna.create_study(direction="minimize", study_name="CatBoost Regressor")
func = lambda trial: objective(trial, X_prepared, y_train.to_numpy())
study.optimize(func, n_trials=100)
print(f"\tBest value (mae): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
print(f"\t\t{key}: {value}")
"""### Best Params Model
Based on the 400 ``Optuna`` trials, the following hyperparameter values achieved the best loss (MAE: 0.05265):
```
best_params = {
'learning_rate': 0.01,
'l2_leaf_reg': 3,
'depth': 6,
'n_estimators': 900,
'random_state': 42,
'eval_metric': 'MAE',
'loss_function': 'MAE'
}
```
"""
best_params = {
'learning_rate': 0.03,
'l2_leaf_reg': 3,
'depth': 6,
'n_estimators': 900,
'random_state': 42,
'eval_metric': "MAE",
'loss_function': 'MAE',
}
fit_params={
'cat_features': cat_vars,
'verbose': False,
}
cb_base = cb.CatBoostRegressor(**best_params)
cb_base.fit(X_prepared, y_train, **fit_params)
get_eval_metrics([cb_base], X_prepared_val, y_val)
# get_cross_val_scores([cb_base], X_prepared, y_train, cv=3, fit_params=fit_params)
cb_importances = cb_base.get_feature_importance()
cb_importances = pd.DataFrame(sorted(zip(cb_importances, X_cb.columns)), columns=['Value','Feature'])
# Select the 20 most important features
cb_importances = cb_importances.sort_values(by='Value', ascending=False)[:20]
# Plot importances
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature", data=cb_importances.sort_values(by="Value", ascending=False))
plt.title('CatBoost Global Feature Importances')
plt.tight_layout()
plt.show()
"""Analyzing the plot above shows how the location related features (``propertycountylandusecode``, ``regionidzip``) are some of the most important followed by the age of house and ``calculatedfinishedsquarefeet``. """
!pip install ml_insights
# !pip install ml_insights #if you don't have it installed
import ml_insights as mli
range_pts = mli.get_range_dict(X_cb)
test_pts = X_cb_val.sample(5, random_state=40)
imp_num_feat = [
'calculatedfinishedsquarefeet', 'house_age',
'taxamount', 'structuretaxvaluedollarcnt', 'latitude',
]
"""The ranges of values used for x-axis need to be updated to remove outliers to better interpret the ICE-plot."""
# Updating the ranges of features on x-axis for better interpretation
range_pts['calculatedfinishedsquarefeet'] = np.linspace(0, 6000, 200)
range_pts['house_age'] = np.linspace(0, 130, 65)
range_pts['taxamount'] = np.linspace(0, 25000, 200)
range_pts['structuretaxvaluedollarcnt'] = np.linspace(0, 300000, 200)
range_pts['latitude'] = np.linspace(3.35e+07, 3.46e+07, 200)
mli.ice_plot(cb_base, test_pts, imp_num_feat, range_pts=range_pts, pred_fn='predict', figsize=(15,10))
!pip install shap
# !pip install shap #if you don't have it installed
import shap
shap.initjs()
explainer = shap.Explainer(cb_base)
shap_values = explainer(X_cb)
# The SHAP expected (base) value should be close to the average model prediction, which in turn is close to the mean of the target
explainer.expected_value, np.mean(y_train)
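# Sanity-check sketch (not from the original notebook): the SHAP base value should
# roughly match the average model prediction over the explained data.
print(explainer.expected_value, np.mean(cb_base.predict(X_cb)))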
!pip install ipython
from IPython.core.display import display, HTML
shap.initjs()
y_train1 = y_train.reset_index(drop=True)
# Reset the index so the positional indices from argsort line up with the SHAP values
i_med = np.argsort(y_train1)[len(y_train1)//2]           # median logerror
i_71 = np.argsort(y_train1)[int(len(y_train1) * 0.75)]   # ~75th percentile
i_20 = np.argsort(y_train1)[int(len(y_train1) * 0.20)]   # 20th percentile
shap.plots.waterfall(shap_values[i_71], max_display=15)
#shap_html = shap.force_plot(explainer.expected_value, shap_values[i_med].values, matplotlib=True)
#shap_html.savefig(course_path + 'shap_force_plot.png')
shap.force_plot(explainer.expected_value, shap_values[i_med].values, matplotlib=True)
plt.savefig(course_path +'shap_force_plot.png')
"""### Force Plots
Force plots are an equivalent representation to waterfall plots, displaying the same key information in a more condensed format.
"""
# Force plot for 20th percentile y_train value
shap.force_plot(explainer.expected_value, shap_values[i_20].values, matplotlib=True)
plt.savefig(course_path +'shap_force_plot20.png')
"""## SHAP for Global Interpretability
Because SHAP values are additive per prediction, they can also be used to measure how "globally" important a feature is by aggregating the absolute values of the local SHAP values across samples. These numbers can be thought of as the "average absolute impact" a variable has on the prediction.
"""
shap.plots.bar(shap_values, max_display=20)
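# The bar plot above is the mean absolute SHAP value per feature; a minimal sketch
# of that aggregation (assumes shap_values and X_cb from the cells above):
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
global_imp = pd.Series(mean_abs_shap, index=X_cb.columns).sort_values(ascending=False)
print(global_imp.head(20))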
def fit_rf(X_prepared, y_train, random_state=42):
best_params = {
'n_estimators': 400,
'max_samples': 0.3,
'max_depth': 70,
'max_features': 0.6,
'min_samples_leaf': 32,
'random_state': random_state,
}
ran_forest = RandomForestRegressor(**best_params)
ran_forest.fit(X_prepared, y_train)
return ran_forest
def fit_xgb(X_prepared, y_train, random_state=42):
fit_params={
'early_stopping_rounds': 15,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_xgb_val, y_val]],
}
best_params = {
'colsample_bynode': 0.75,
'gamma': 0.201,
'max_depth': 5,
'subsample': 0.95,
'tree_method': 'exact',
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': random_state,
}
xgb_base = xgb.XGBRegressor(**best_params)
xgb_base.fit(X_prepared, y_train, **fit_params)
return xgb_base
def fit_lgbm(X_prepared, y_train, random_state=42):
fit_params={
'early_stopping_rounds': 30,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[X_lgbm_val, y_val]]
}
best_params = {
'num_leaves': 512,
'max_depth': 9,
'lambda_l2': 15,
'bagging_fraction': 0.9,
'feature_fraction': 0.6,
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': random_state,
}
lgbm_base = lgbm.LGBMRegressor(**best_params)
lgbm_base.fit(X_prepared, y_train, **fit_params)
return lgbm_base
def fit_catboost(X_prepared, y_train, random_state=42):
best_params = {
'learning_rate': 0.03,
'l2_leaf_reg': 3,
'depth': 6,
'n_estimators': 900,
'random_state': random_state,
'eval_metric': "MAE",
'loss_function': 'MAE',
}
fit_params={
'cat_features': cat_vars,
'verbose': False,
}
cb_base = cb.CatBoostRegressor(**best_params)
cb_base.fit(X_prepared, y_train, **fit_params)
return cb_base
def get_out_of_fold_preds(X_prepared, y_train, model_fcn, random_state=42):
"""
Use K-Fold Cross Validation to create out-of-fold predictions for supplied model.
The out-of-fold predictions can then be used to train a Meta Learner to further
improve the model's performance.
"""
model_preds = list()
meta_y = list()
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ix, test_ix in kfold.split(X_prepared):
# Depending on whether X_prepared is a dataframe or numpy array, use appropriate indexing
if isinstance(X_prepared, pd.DataFrame):
train_X, test_X = X_prepared.iloc[train_ix], X_prepared.iloc[test_ix]
else:
train_X, test_X = X_prepared[train_ix], X_prepared[test_ix]
train_y, test_y = y_train.iloc[train_ix], y_train.iloc[test_ix]
model = model_fcn(train_X, train_y, random_state=random_state)
preds = model.predict(test_X)
meta_y.extend(test_y)
model_preds.extend(preds)
return model_preds, meta_y
"""
from sklearn.model_selection import KFold
def get_out_of_fold_preds(X_prepared, y_train, model_fcn, random_state=42):
Use K-Fold Cross Validation to create out-of-fold predictions for supplied model.
The out-of-fold predictions can then be used to train a Meta Learner to further
improve the model's performance.
model_preds = list()
meta_y = list()
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
for train_ix, test_ix in kfold.split(X_prepared):
# Depending on whether X_prepared is a dataframe or numpy array, uncomment one line below
# train_X, test_X = X_prepared.iloc[train_ix], X_prepared.iloc[test_ix]
train_X, test_X = X_prepared[train_ix], X_prepared[test_ix]
train_y, test_y = y_train.iloc[train_ix], y_train.iloc[test_ix]
model = model_fcn(train_X, train_y, random_state=random_state)
preds = model.predict(test_X)
meta_y.extend(test_y)
model_preds.extend(preds)
return model_preds, meta_y
"""
# Get Out-of-fold predictions for each base learner
# WARNING: This cell takes multiple hours to run (drop the Random Forest regressor, which takes significantly longer than the others, to save time)
xgb_preds, meta_y = get_out_of_fold_preds(X_xgb, y_train, fit_xgb)
lgbm_preds, meta_y = get_out_of_fold_preds(X_lgbm, y_train, fit_lgbm)
rf_preds, meta_y = get_out_of_fold_preds(X_rf, y_train, fit_rf)
X_cb_reset = X_cb.reset_index(drop=True)
y_train_reset = y_train.reset_index(drop=True)
cb_preds, meta_y = get_out_of_fold_preds(X_cb_reset, y_train_reset, fit_catboost)
# Create meta_X training set by combining out-of-fold predictions from individual learners
yhat1 = array(rf_preds).reshape((len(rf_preds), 1))
yhat2 = array(xgb_preds).reshape((len(xgb_preds), 1))
yhat3 = array(lgbm_preds).reshape((len(lgbm_preds), 1))
yhat4 = array(cb_preds).reshape((len(cb_preds), 1))
meta_X = hstack((yhat1, yhat2, yhat3, yhat4))
meta_X
# Create validation dataset for testing the Meta Learner
# Make predictions on original validation sets using originally trained base learners
xgb_val_preds = xgb_base.predict(X_xgb_val)
lgbm_val_preds = lgbm_base.predict(X_lgbm_val)
cb_val_preds = cb_base.predict(X_cb_val)
rf_val_preds = ran_forest.predict(X_rf_val)
# Create validation dataset
yhat1 = array(rf_val_preds).reshape((len(rf_val_preds), 1))
yhat2 = array(xgb_val_preds).reshape((len(xgb_val_preds), 1))
yhat3 = array(lgbm_val_preds).reshape((len(lgbm_val_preds), 1))
yhat4 = array(cb_val_preds).reshape((len(cb_val_preds), 1))
meta_X_val = hstack((yhat1, yhat2, yhat3, yhat4))
meta_X_val
meta_X.shape, meta_X_val.shape
"""## Meta Learners
Train a meta learner on the training set created from the out-of-fold predictions of the individual base learners.
"""
# Initialize LightGBM Model
best_params = {
'num_leaves': 128,
'max_depth': 6,
'lambda_l2': 15,
'bagging_fraction': 0.6,
'learning_rate': 0.01,
'n_estimators': 10000,
'random_state': 42,
}
# Fit model using early stopping and validation set
fit_params={'early_stopping_rounds': 30,
'eval_metric': 'mae',
'verbose': False,
'eval_set': [[meta_X_val, y_val]]
}
lgbm_stacked = lgbm.LGBMRegressor(**best_params)
lgbm_stacked.fit(meta_X, meta_y, **fit_params)
# CatBoost Meta Learner
best_params = {
'learning_rate': 0.01,
'l2_leaf_reg': 15,
'depth': 6,
'min_data_in_leaf': 32,
'subsample': 0.6,
'n_estimators': 10000,
'random_state': 42,
'eval_metric': "MAE",
}
fit_params={
'eval_set': (meta_X_val, y_val),
'early_stopping_rounds': 30,
'verbose': False,
}
cb_stacked = cb.CatBoostRegressor(**best_params)
cb_stacked.fit(meta_X, meta_y, **fit_params)
# Meta Model MAE on Validation Set
y_preds = lgbm_stacked.predict(meta_X_val)
# y_preds = cb_stacked.predict(meta_X_val)
mae = mean_absolute_error(y_val, y_preds)
rmse = mean_squared_error(y_val, y_preds, squared=False)
print(f"Stacked Model Evaluation")
print(f"MAE: {mae}, RMSE: {rmse}")
def create_model_preds(all_models, X):
"""
Combine model predictions from different models using the input weights.
all_models: List[Tuple(model, preprocessor)]
List of a tuple of model info where each tuple has fitted model and data preprocessor
weights: List[]
Weights for each model in all_models (must be same length as all_models)
X:
Data to predict on
"""
combined_preds = []
for model_info in all_models:
model, preprocessor = model_info
print(f"Starting preds for {model}")
# Note: function below is defined in later section (Submission CSV Generation)
model_pred = predict_in_chunks(model, preprocessor, X)
combined_preds.append(model_pred)
return combined_preds
def combine_model_preds(combined_preds, weights, X):
    """Blend the per-model predictions into a single array using the supplied weights."""
    # Start from zeros (np.empty would add the weighted predictions to uninitialized memory)
    final_pred = np.zeros(len(X))
    print(f"Combining with weights: {weights}")
    for idx, weight in enumerate(weights):
        final_pred += combined_preds[idx] * weight
    return final_pred
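# Hypothetical toy check (not from the original notebook) of the weighted blend
# performed by combine_model_preds: two fake prediction arrays mixed 70/30.
toy_preds = [np.array([0.10, 0.20]), np.array([0.30, 0.40])]
print(combine_model_preds(toy_preds, [0.7, 0.3], np.zeros(2)))  # expect [0.16 0.26]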
def chunker(seq, size):
"""Function to iterate over dataframe in chunks."""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
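# Tiny illustration (hypothetical sizes, not from the original notebook) of how
# chunker splits a DataFrame into positional slices:
demo = pd.DataFrame({'a': range(7)})
print([len(chunk) for chunk in chunker(demo, 3)])  # -> [3, 3, 1]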
def predict_in_chunks(model, preprocessor, df_test):
final_pred = []
for group in chunker(df_test, 300000):
group = preprocessor.transform(group)
pred = model.predict(group)
final_pred.append(pred)
del group; gc.collect()
del pred; gc.collect()
return np.concatenate(final_pred).ravel()
print('Loading Properties ...')
#properties_2016 = pd.read_csv('../input/zillow-prize-1/properties_2016.csv', low_memory = False)
properties_2016 = pd.read_csv(course_path+'properties_2016.csv', low_memory = False)
print('Loading Sample ...')
#sample_submission = pd.read_csv('../input/zillow-prize-1/sample_submission.csv', low_memory = False)
sample_submission = pd.read_csv(course_path+ 'sample_submission.csv', low_memory = False)
sample_submission['parcelid'] = sample_submission['ParcelId']
print('Merging sample submission with 2016 properties ...')
df_test = pd.merge(sample_submission[['parcelid']], properties_2016, how = 'left', on = 'parcelid')
df_test['transactiondate'] = pd.Timestamp('2016-12-01')
df_test.shape
# Models to generate predictions for
all_models = [
(ran_forest, rf_preprocessor),
(xgb_base, xgb_preprocessor),
(lgbm_base, lgbm_preprocessor),
(cb_base, cb_preprocessor),
]
# Creating Predictions for each model
combined_preds = create_model_preds(all_models, df_test)  # df_test is loaded above; the same loading code is repeated in the Submission CSV Generation section
# Create Meta_X from base learner predictions
yhat1 = combined_preds[0].reshape((len(df_test), 1))
yhat2 = combined_preds[1].reshape((len(df_test), 1))
yhat3 = combined_preds[2].reshape((len(df_test), 1))
yhat4 = combined_preds[3].reshape((len(df_test), 1))
meta_X_submission = hstack((yhat1, yhat2, yhat3, yhat4))
meta_X_submission.shape
# Create predictions from Meta Learners using outputs from individual learners
lgbm_stacked_pred = lgbm_stacked.predict(meta_X_submission)
cb_stacked_pred = cb_stacked.predict(meta_X_submission)
meta_combined_preds = [lgbm_stacked_pred, cb_stacked_pred]
"""#### **The final model yielded in a Public Score of 0.06416 and a private score of 0.07497 which are within Top 250 and Top 60 respectively in the world out of over 3700 submissions.** """
# Models = [RF, XGB, LGBM, CB]
weights = [0.0, 0.0, 0.20, 0.55]
final_pred = combine_model_preds(combined_preds, weights, df_test)
# Models: [Stacked_LGBM, Stacked_CB]
weights = [0.25, 0.00]
print(f"Combining with weights: {weights}")
for idx, weight in enumerate(weights):
final_pred += meta_combined_preds[idx] * weight
# Create final submission CSV using final test predictions
#sub = pd.read_csv('../input/zillow-prize-1/sample_submission.csv')
sub = pd.read_csv(course_path+'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
sub[c] = final_pred
print('Writing csv ...')
#sub.to_csv('_9_Combined_plus_Stacked.csv', index=False, float_format='%.4f')
sub.to_csv(course_path+'_9_Combined_plus_Stacked.csv', index=False, float_format='%.4f')
"""# Submission CSV Generation
The following code runs a chosen model along with the Scikit-Learn Pipeline preprocessor to generate the final submission CSV using the Kaggle Test data.
"""
print('Loading Properties ...')
#properties_2016 = pd.read_csv('../input/zillow-prize-1/properties_2016.csv', low_memory = False)
properties_2016 = pd.read_csv(course_path+'properties_2016.csv', low_memory = False)
print('Loading Sample ...')
#sample_submission = pd.read_csv('../input/zillow-prize-1/sample_submission.csv', low_memory = False)
sample_submission = pd.read_csv(course_path+ 'sample_submission.csv', low_memory = False)
sample_submission['parcelid'] = sample_submission['ParcelId']
print('Merging sample submission with 2016 properties ...')
df_test = pd.merge(sample_submission[['parcelid']], properties_2016, how = 'left', on = 'parcelid')
df_test['transactiondate'] = pd.Timestamp('2016-12-01')
df_test.shape
del properties_2016, sample_submission; gc.collect()
def chunker(seq, size):
"""Function to iterate over dataframe in chunks."""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
def predict_in_chunks(model, preprocessor, df_test):
final_pred = []
for group in chunker(df_test, 300000):
group = preprocessor.transform(group)
pred = model.predict(group)
final_pred.append(pred)
del group; gc.collect()
del pred; gc.collect()
return np.concatenate(final_pred).ravel()
# Run the chosen model to generate predictions
model = cb_base
final_pred = predict_in_chunks(model, cb_preprocessor, df_test)
len(final_pred)
# Create final submission CSV using final test predictions
#sub = pd.read_csv('../input/zillow-prize-1/sample_submission.csv')
sub = pd.read_csv(course_path+ 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
sub[c] = final_pred
print('Writing csv ...')
#sub.to_csv('2_CatBoost_smaller_feat_set_best_params.csv', index=False, float_format='%.4f')
sub.to_csv(course_path+'2_CatBoost_smaller_feat_set_best_params.csv', index=False, float_format='%.4f')