# **To import libraries and packages**
"""
# Commented out IPython magic to ensure Python compatibility.
# Import necessary libraries
import gc | |
import numpy as np # linear algebra | |
from numpy import hstack | |
from numpy import array | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import matplotlib.pyplot as plt # Matlab-style plotting | |
import seaborn as sns | |
color = sns.color_palette() | |
sns.set_style('darkgrid') | |
from scipy import stats | |
from scipy.stats import norm, skew #for some statistics | |
# Definitions | |
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) # Limit float output to 3 decimal places
pd.set_option('display.max_columns', 100) # Display up to 100 columns
pd.set_option('display.max_rows', 100)
# %matplotlib inline | |
# Check the files available in the directory | |
import os | |
for dirname, _, filenames in os.walk('../input'): | |
for filename in filenames: | |
print(os.path.join(dirname, filename)) | |
# Suppress warnings | |
import warnings | |
warnings.filterwarnings('ignore') | |
!pip install scikit-learn==1.0.0 | |
# Confirm sklearn version | |
from sklearn import __version__ | |
__version__ | |
from google.colab import drive | |
drive.mount('/content/drive') | |
#https://drive.google.com/drive/my-drive | |
#course_path = "/content/drive/My Drive/mnist/" | |
course_path = "/content/drive/MyDrive/Colab Notebooks/" | |
# To import datasets | |
#train_2016 = pd.read_csv('/content/drive/MyDrive/zillow/train_2016_v2.csv/train_2016_v2.csv', low_memory=False) | |
train_2016 = pd.read_csv(course_path + 'train_2016_v2.csv', low_memory=False) | |
#properties_2016, NAlist_2016 = reduce_mem_usage(pd.read_csv('/content/drive/MyDrive/zillow/properties_2016.csv/properties_2016.csv',low_memory=False)) | |
#properties_2016 = pd.read_csv('/content/drive/MyDrive/zillow/properties_2016.csv/properties_2016.csv',low_memory=False) | |
properties_2016 = pd.read_csv(course_path + 'properties_2016.csv',low_memory=False) | |
#train_2017 = pd.read_csv('/content/drive/MyDrive/zillow/train_2017.csv/train_2017.csv',low_memory=False) | |
train_2017 = pd.read_csv(course_path +'train_2017.csv',low_memory=False) | |
#properties_2017, NAlist_2017 = reduce_mem_usage(pd.read_csv('/content/drive/MyDrive/zillow/properties_2017.csv/properties_2017.csv',low_memory=False)) | |
#properties_2017= pd.read_csv('/content/drive/MyDrive/zillow/properties_2017.csv/properties_2017.csv',low_memory=False) | |
properties_2017= pd.read_csv(course_path + 'properties_2017.csv',low_memory=False) | |
"""# **2. To explore datasets**""" | |
data_manual = pd.read_excel(course_path+'zillow_data_dictionary.xlsx') | |
data_manual.head() | |
# to inspect dataset | |
#unique_train2016 = unique_train2016.sort_values(by='parcelid') | |
properties_2016=properties_2016.sort_values(by='parcelid') | |
properties_2016.head(15) | |
"""# To import datasets""" | |
import os | |
#HOUSING_PATH = os.path.join(course_path) | |
#PROPERTIES_2016 = 'properties_2016.csv' | |
#PROPERTIES_2017 = 'properties_2017.csv' | |
#TRAIN_2016 = 'train_2016_v2.csv' | |
#TRAIN_2017 = 'train_2017.csv' | |
#def load_housing_data(housing_path=HOUSING_PATH):
def load_housing_data():
    #properties_2016 = pd.read_csv(os.path.join(housing_path, PROPERTIES_2016))
    #properties_2017 = pd.read_csv(os.path.join(housing_path, PROPERTIES_2017))
    #train_2016 = pd.read_csv(os.path.join(housing_path, TRAIN_2016))
    #train_2017 = pd.read_csv(os.path.join(housing_path, TRAIN_2017))
    # Left join will ignore all properties that do not have a logerror (target variable) associated with them
    # (use new local names so the module-level dataframes are not shadowed inside the function)
    train_2016_merged = pd.merge(train_2016, properties_2016, how='left', on='parcelid')
    train_2017_merged = pd.merge(train_2017, properties_2017, how='left', on='parcelid')
    # Union data for 2016 and 2017 into one dataframe
    all_properties = pd.concat([properties_2016, properties_2017], ignore_index=True)
    all_training = pd.concat([train_2016_merged, train_2017_merged], ignore_index=True)
    return all_properties, all_training
train_2016 = pd.merge(train_2016, properties_2016, how = 'left', on = 'parcelid') | |
train_2017 = pd.merge(train_2017, properties_2017, how = 'left', on = 'parcelid') | |
# Union data for 2016 and 2017 into one dataframe | |
all_properties = pd.concat([properties_2016, properties_2017], ignore_index=True) | |
all_training = pd.concat([train_2016, train_2017], ignore_index=True) | |
#all_properties, housing = load_housing_data() | |
housing=all_training | |
housing.head() | |
print("Total Properties Shape: {}".format(all_properties.shape)) | |
print("-"*50) | |
housing.info() | |
# Check for and drop duplicates in training dataset | |
def check_duplicates(housing): | |
idsUnique = len(housing[['parcelid', 'transactiondate']].value_counts()) | |
idsTotal = housing.shape[0] | |
idsDupli = idsTotal - idsUnique | |
print("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries") | |
def drop_duplicates(housing): | |
# Drop all duplicate entries which have the same parcelID and Transaction Date | |
print("Dropping all duplicates based on parcelid and transactiondate...") | |
return housing.drop_duplicates(subset=['parcelid', 'transactiondate'], keep='last', ignore_index=True) | |
# Check for and drop duplicates | |
check_duplicates(housing) | |
housing = drop_duplicates(housing) | |
# Validate | |
check_duplicates(housing) | |
"""## Target Variable | |
Target Variable : `logerror` | |
""" | |
y = housing.logerror | |
y.hist(bins=100, figsize=(8,5)) | |
plt.show() | |
y.describe() | |
sns.distplot(y , fit=norm); | |
# Get the fitted parameters used by the function | |
(mu, sigma) = norm.fit(y) | |
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) | |
#Now plot the distribution | |
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], | |
loc='best') | |
plt.ylabel('Frequency') | |
plt.title('LogError distribution') | |
#Get also the QQ-plot | |
fig = plt.figure() | |
res = stats.probplot(y, plot=plt) | |
plt.show() | |
"""### Dropping Outliers | |
To drop all outliers that are more than 2.5 standard deviations away from the mean. | |
""" | |
highest_thres = y.mean() + 2.5*y.std() | |
lowest_thres = y.mean() - 2.5*y.std() | |
print("Highest allowed",highest_thres) | |
print("Lowest allowed", lowest_thres) | |
# Only the training set outliers will be dropped (not validation or testing set to ensure model performs well on outliers too) | |
y = y[y > lowest_thres] | |
y = y[y < highest_thres] | |
# Update original Housing dataframe | |
housing = housing[housing.logerror > lowest_thres] | |
housing = housing[housing.logerror < highest_thres] | |
# Drop rows containing 75% or more NaN values
percent = 75.0 | |
min_count = int(((100-percent)/100)*housing.shape[1] + 1) | |
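# dropna(thresh=min_count) keeps only rows with at least min_count non-NaN values,
# i.e. rows where 75% or more of the columns are missing get dropped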
housing = housing.dropna(axis=0, thresh=min_count) | |
housing.shape | |
y.hist(bins=100, figsize=(8,5)) | |
plt.show() | |
#Check the new distribution | |
sns.distplot(y , fit=norm); | |
# Get the fitted parameters used by the function | |
(mu, sigma) = norm.fit(y) | |
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) | |
#Now plot the distribution | |
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], | |
loc='best') | |
plt.ylabel('Frequency') | |
plt.title('LogError distribution') | |
#Get also the QQ-plot | |
fig = plt.figure() | |
res = stats.probplot(y, plot=plt) | |
plt.show()
"""The skew now appears corrected and the data looks much closer to a normal distribution.

# Create Test / Train Datasets
"""
from zlib import crc32 | |
from sklearn.model_selection import train_test_split | |
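# Hash-based split: hashing each row's id with crc32 and putting it in the test set
# whenever the hash falls below test_ratio * 2**32 keeps the split stable across
# reruns and consistent even if new rows are appended to the dataset later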
def test_set_check(identifier, test_ratio): | |
return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32 | |
def split_train_test_by_id(data, test_ratio, id_column): | |
ids = data[id_column] | |
in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio)) | |
train_set = data.loc[~in_test_set] | |
test_set = data.loc[in_test_set] | |
X_train = train_set.drop("logerror", axis=1) | |
y_train = train_set["logerror"].copy() | |
X_test = test_set.drop("logerror", axis=1) | |
y_test = test_set["logerror"].copy() | |
return X_train, X_test, y_train, y_test | |
X_other, X_test, y_other, y_test = split_train_test_by_id(housing, 0.1, "parcelid") | |
print(f"Other Dataset Shape: {X_other.shape}; Test Dataset Shape: {X_test.shape}") | |
# Split X_other into training and validation sets | |
X_train, X_val, y_train, y_val = train_test_split(X_other, y_other, train_size=0.9, random_state=42) | |
print(f"Training Dataset Shape: {X_train.shape}") # 81% of instances are in training | |
print(f"Test Dataset Shape: {X_test.shape}") # 10% of instances are in test | |
print(f"Validation Dataset Shape: {X_val.shape}") # 9% of instances are in validation | |
# Clear memory | |
del all_properties, housing; gc.collect() | |
"""# **Data Preprocessing Pipelines** | |
### Missing Data | |
""" | |
all_data_na = (X_train.isnull().sum() / len(X_train)) * 100 | |
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False) | |
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na}) | |
missing_data[:35] | |
print ("Features with one unique value!!") | |
exclude_unique = [] | |
for c in X_train.columns: | |
num_uniques = len(X_train[c].unique()) | |
if X_train[c].isnull().sum() != 0: | |
num_uniques -= 1 | |
if num_uniques == 1: | |
exclude_unique.append(c) | |
print(exclude_unique) | |
# Training data copy to test individual pipelines | |
X_temp = X_train.copy() | |
"""## Dropping Features Pipeline""" | |
from sklearn.base import BaseEstimator, TransformerMixin | |
class FeatureDropper(BaseEstimator, TransformerMixin): | |
def __init__(self, features_to_drop): | |
self.features_to_drop = features_to_drop | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
updated_X = X.drop(self.features_to_drop, axis=1) | |
return updated_X | |
# Drop features | |
lin_reg_drop_vars = ["finishedsquarefeet13", "finishedsquarefeet15", "finishedfloor1squarefeet", "finishedsquarefeet50", | |
"storytypeid", "architecturalstyletypeid", "buildingclasstypeid", "typeconstructiontypeid", "finishedsquarefeet6", | |
"pooltypeid10", "pooltypeid7", "hashottuborspa", "fireplaceflag", "threequarterbathnbr", "calculatedbathnbr", | |
"fullbathcnt", "numberofstories", "rawcensustractandblock", "censustractandblock", | |
"finishedsquarefeet12", "taxvaluedollarcnt", "taxamount", "assessmentyear", "roomcnt", | |
"propertyzoningdesc", "regionidneighborhood", "regionidzip", "taxdelinquencyyear", | |
"propertycountylandusecode", "regionidcity", "parcelid", "basementsqft", "yardbuildingsqft26", "transactiondate" | |
] | |
# Sample code to test pipeline | |
feat_dropper = FeatureDropper(features_to_drop=lin_reg_drop_vars) | |
X_temp = feat_dropper.fit_transform(X_temp) | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.preprocessing import RobustScaler | |
# Impute 0 | |
impute_0_vars = ["yardbuildingsqft17", "fireplacecnt", "poolcnt", "garagecarcnt", "garagetotalsqft", | |
"pooltypeid2", "poolsizesum", "decktypeid", "taxdelinquencyflag"] | |
# Impute mode | |
impute_mode_vars = ["airconditioningtypeid", "heatingorsystemtypeid", "unitcnt", "fips", | |
"propertylandusetypeid", "regionidcounty", "yearbuilt"] | |
# Impute median | |
impute_median_vars = ["buildingqualitytypeid", "lotsizesquarefeet", "bathroomcnt", "bedroomcnt", "calculatedfinishedsquarefeet", | |
"structuretaxvaluedollarcnt", "landtaxvaluedollarcnt", "latitude", "longitude"] | |
univariate_impute_pipe = ColumnTransformer([ | |
("impute_0", SimpleImputer(strategy="constant", fill_value=0), impute_0_vars), | |
("impute_mode", SimpleImputer(strategy="most_frequent"), impute_mode_vars), | |
("impute_median", SimpleImputer(strategy="median"), impute_median_vars), | |
], | |
remainder='passthrough' | |
) | |
# Sample code to test pipeline [ONLY RUN ONE OF UNIVARIATE OR MULTIVARIATE PIPELINES] | |
X_temp = univariate_impute_pipe.fit_transform(X_temp) | |
"""## Multivariate Imputation Pipeline | |
We will use a `RandomForestRegressor` as the estimator to predict the missing values in each feature. For details on the Random Forest algorithm and tuning its hyperparameters, refer to the modeling section later.
""" | |
from sklearn.experimental import enable_iterative_imputer | |
from sklearn.impute import IterativeImputer | |
from sklearn.ensemble import RandomForestRegressor | |
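# Illustrative sketch only (toy array below, not the Zillow columns): IterativeImputer
# models each feature that has missing values as a function of the other features and
# fills the gaps with the estimator's predictions, iterating up to max_iter times.
toy = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0], [np.nan, 8.0]])
toy_imputer = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, random_state=0),
                               max_iter=2, random_state=0)
print(toy_imputer.fit_transform(toy))  # NaNs replaced with model-based estimates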
cat_impute_vars = ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"] | |
numeric_impute_vars = ["bathroomcnt", "bedroomcnt", "buildingqualitytypeid", "calculatedfinishedsquarefeet", | |
"fireplacecnt", "garagecarcnt", "garagetotalsqft", "latitude", "longitude", "lotsizesquarefeet", "poolcnt", | |
"poolsizesum", "unitcnt", "yardbuildingsqft17", "yearbuilt", "structuretaxvaluedollarcnt", "landtaxvaluedollarcnt"] | |
multivariate_impute_pipe = ColumnTransformer([ | |
("impute_cats", SimpleImputer(strategy="constant", fill_value='missing'), cat_impute_vars), | |
("impute_num", IterativeImputer(estimator=RandomForestRegressor(n_estimators=1, max_depth=30, min_samples_leaf=32), random_state=0, max_iter=1), numeric_impute_vars), | |
], | |
remainder='passthrough' | |
) | |
# Sample code to test pipeline [ONLY RUN ONE OF UNIVARIATE OR MULTIVARIATE PIPELINES] | |
# X_temp = multivariate_impute_pipe.fit_transform(X_temp) | |
"""## Column Names Appender Pipeline | |
The output of the imputation pipeline is a NumPy array, not a pandas DataFrame. The following transformer takes the imputation pipeline as an input and rebuilds a DataFrame with column names from the NumPy array.
""" | |
class ColumnNamesAppender(BaseEstimator, TransformerMixin): | |
""" | |
Takes a Column Transformer pipeline as an input along with the Numpy Array | |
to output the DataFrame with column names appended to it. | |
""" | |
def __init__(self, column_transformer, orig_columns, num_transformers): | |
self.column_transformer = column_transformer | |
self.orig_columns = orig_columns | |
self.num_transformers = num_transformers | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
X_column_names = self.get_columns_from_transformer(self.column_transformer, self.orig_columns, self.num_transformers) | |
# Create dataframe from numpy array and column names | |
X = pd.DataFrame(X, columns=X_column_names) | |
return X | |
@staticmethod | |
def get_columns_from_transformer(column_transformer, input_columns, num_transformers):
col_name = [] | |
for transformer_in_columns in column_transformer.transformers_: #the last transformer is ColumnTransformer's 'remainder' | |
raw_col_name = transformer_in_columns[2] | |
if isinstance(transformer_in_columns[1],Pipeline): | |
transformer = transformer_in_columns[1].steps[-1][1] | |
else: | |
transformer = transformer_in_columns[1] | |
try: | |
names = transformer.get_feature_names([raw_col_name]) | |
except AttributeError: # if no 'get_feature_names' function, use raw column name | |
names = raw_col_name | |
if isinstance(names,np.ndarray): | |
col_name += names.tolist() | |
elif isinstance(names,list): | |
col_name += names | |
elif isinstance(names,str): | |
col_name.append(names) | |
return col_name | |
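# Note (assumes a newer scikit-learn than the 1.0.0 pinned above): from roughly 1.1
# most transformers implement get_feature_names_out(), and from 1.2 ColumnTransformer
# supports set_output(transform="pandas"), which would make a custom name-restoring
# transformer like the one above largely unnecessary.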
# Code to test pipeline (using imputation pipeline as sample ColumnTransformer pipeline) | |
# TODO: Currently unable to handle columns that are passed through remainder | |
column_appender = ColumnNamesAppender(univariate_impute_pipe, orig_columns=X_train.columns, num_transformers=3) | |
X_temp = column_appender.fit_transform(X_temp) | |
# Check if still missing values | |
all_data_na = (X_temp.isnull().sum() / len(X_temp)) * 100 | |
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:40] | |
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na}) | |
missing_data.head()
"""## Convert Variables Types Pipeline | |
This section changes the feature data type to a more appropriate type. This is because many categorical and boolean variables are currently encoded as floats. | |
""" | |
convert_to_int = ["yearbuilt"] | |
convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"] | |
convert_to_float= ["bathroomcnt", "bedroomcnt", "buildingqualitytypeid", "calculatedfinishedsquarefeet", | |
"fireplacecnt", "garagecarcnt", "garagetotalsqft", "latitude", "longitude", "lotsizesquarefeet", "poolcnt", | |
"poolsizesum", "unitcnt", "yardbuildingsqft17", | |
"structuretaxvaluedollarcnt", "landtaxvaluedollarcnt"] | |
class ConvertFeatureType(BaseEstimator, TransformerMixin): | |
def __init__(self, convert_to_int=[], convert_to_bool=[], convert_to_string=[], convert_to_float=[]): | |
self.convert_to_int = convert_to_int | |
self.convert_to_bool = convert_to_bool | |
self.convert_to_string = convert_to_string | |
self.convert_to_float = convert_to_float | |
self.features = {"int": convert_to_int, "float": convert_to_float, "boolean": convert_to_bool, "str": convert_to_string} | |
def fit(self, X, y=None): | |
return self # Nothing else to do | |
def transform(self, X): | |
self.map_bool_features(X) | |
for data_type in self.features.keys(): | |
X = self.convert_feature_types(X, data_type) | |
return X | |
def map_bool_features(self, X): | |
"""Convert all non null values to True in bool features prior to changing type to Boolean.""" | |
for var in self.convert_to_bool: | |
X.loc[X[var].notnull(), var] = True  # use .loc to avoid chained assignment
def convert_feature_types(self, X, data_type): | |
for var in self.features[data_type]: | |
X[var] = X[var].astype(data_type) | |
return X | |
# # Code to test pipeline | |
feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string, convert_to_float=convert_to_float) | |
X_temp = feature_type_changer.fit_transform(X_temp) | |
class ConvertToType(BaseEstimator, TransformerMixin): | |
'''
Variation of the pipeline above to convert ALL columns to a specific type.
Handy for algorithms such as XGBoost, which expect features not to be of object dtype.
'''
def __init__(self, var_type, vars_to_convert=None): | |
self.var_type = var_type | |
self.vars_to_convert = vars_to_convert | |
def fit(self, X, y=None): | |
return self # Nothing else to do | |
def transform(self, X): | |
if self.vars_to_convert: | |
for col in self.vars_to_convert: | |
X[col] = X[col].astype(self.var_type) | |
else: | |
for col in X.columns: | |
X[col] = X[col].astype(self.var_type) | |
return X | |
# Code to test pipeline | |
# convert_to_float = ConvertToType(var_type='float', vars_to_convert=sample_vars) | |
# X_temp = convert_to_float.fit_transform(X_temp)
"""## Date Feature Creator Pipeline
This pipeline creates simple date features by extracting information from `transactiondate`.
"""
class CreateDateFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Creates simple date features by extracting the information from `transactiondate` | |
""" | |
def __init__(self): | |
pass | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
dt = pd.to_datetime(X['transactiondate']).dt | |
X['transaction_year'] = (dt.year).astype('category') | |
X['transaction_month'] = ((dt.year - 2016)*12 + dt.month).astype('category') | |
X['transaction_day'] = dt.day | |
X['transaction_quarter'] = ((dt.year - 2016)*4 + dt.quarter).astype('category') | |
X = X.drop(['transactiondate'], axis=1) | |
return X | |
# Code to test pipeline | |
# date_feat_creator = CreateDateFeatures() | |
# X_temp = date_feat_creator.fit_transform(X_temp) | |
"""## Year Feature Creation Pipeline | |
The following transformer takes year features as input and converts them into years elapsed up to the present. For example: `1989` is converted into `present_year(2021) - 1989 = 32`.
""" | |
from datetime import date | |
class CreateYearFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Creates new features converting dates into years from present. | |
Eg: 1989 is converted into present_year(2021) - 1989 which is 32. | |
""" | |
def __init__(self, date_features): | |
self.date_features = date_features | |
self.current_year = date.today().year | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
for var in self.date_features.keys(): | |
new_var_name = self.date_features[var] | |
X[new_var_name] = self.current_year - X[var] | |
X[new_var_name] = X[new_var_name].astype('float') | |
# Drop old feature | |
X.drop(var, axis=1, inplace=True) | |
return X | |
# Date features | |
date_features = {"yearbuilt": "house_age"} | |
# Code to test pipeline | |
year_feat_creator = CreateYearFeatures(date_features=date_features) | |
X_temp = year_feat_creator.fit_transform(X_temp) | |
"""## Combining Existing Features Pipeline | |
The following transformer creates new derived variables by combining existing variables | |
""" | |
class CreateDerivedFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Creates new features by combining existing variables | |
""" | |
def __init__(self): | |
return None # nothing else to do | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
# Average Size Features | |
X['avg_garage_size'] = X['garagetotalsqft'] / X['garagecarcnt'] | |
X['property_tax_per_sqft'] = X['taxamount'] / X['calculatedfinishedsquarefeet'] | |
# Average area in sqft per room | |
mask = (X.roomcnt >= 1) # avoid dividing by zero | |
X.loc[mask, 'avg_area_per_room'] = X.loc[mask, 'calculatedfinishedsquarefeet'] / X.loc[mask, 'roomcnt'] | |
# Derived Room Count | |
X['derived_room_cnt'] = X['bedroomcnt'] + X['bathroomcnt'] | |
# Use the derived room_cnt to calculate the avg area again | |
mask = (X.derived_room_cnt >= 1) | |
X.loc[mask,'derived_avg_area_per_room'] = X.loc[mask,'calculatedfinishedsquarefeet'] / X.loc[mask,'derived_room_cnt'] | |
# Rotated Coordinates | |
X['location_1'] = X['latitude'] + X['longitude'] | |
X['location_2'] = X['latitude'] - X['longitude'] | |
X['location_3'] = X['latitude'] + 0.5 * X['longitude'] | |
X['location_4'] = X['latitude'] - 0.5 * X['longitude'] | |
# 'finishedsquarefeet12' (finished living area) and 'finishedsquarefeet15' (total area) cover only a strict subset of
# 'calculatedfinishedsquarefeet' in terms of non-missing values. Also, when both fields are not null, the values are always the same.
# So we can probably drop 'finishedsquarefeet12' and 'finishedsquarefeet15' since they are redundant.
# If there are patterns in when the values are missing, the two isMissing binary features below can capture them.
X['missing_finished_area'] = X['finishedsquarefeet12'].isnull().astype(float) | |
X['missing_total_area'] = X['finishedsquarefeet15'].isnull().astype(float) | |
X = X.drop(['finishedsquarefeet12', 'finishedsquarefeet15'], axis=1) | |
X['missing_bathroom_cnt_calc'] = X['calculatedbathnbr'].isnull().astype(float) | |
X = X.drop(['calculatedbathnbr'], axis=1) | |
return X | |
# Code to test pipeline | |
# derived_feat_creator = CreateDerivedFeatures() | |
# X_temp = derived_feat_creator.fit_transform(X_temp) | |
"""## Create Aggregated Features Pipeline | |
The following pipeline creates aggregated features such as aggregations for a specific zip code. | |
""" | |
class CreateAggregatedFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Creates aggregated features (group counts and per-group means) along with deviation features relative to those means.
""" | |
def __init__(self, group_col, agg_cols): | |
self.group_col = group_col | |
self.agg_cols = agg_cols | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
group_col = self.group_col | |
X[group_col + '-groupcnt'] = X[group_col].map(X[group_col].value_counts()) | |
new_columns = [] # New feature columns added to the DataFrame | |
for col in self.agg_cols: | |
aggregates = X.groupby(group_col, as_index=False)[col].agg([np.mean]) | |
aggregates.columns = [group_col + '-' + col + '-' + s for s in ['mean']] | |
new_columns += list(aggregates.columns) | |
X = X.merge(how='left', right=aggregates, on=group_col) | |
for col in self.agg_cols: | |
mean = X[group_col + '-' + col + '-mean'] | |
diff = X[col] - mean | |
X[group_col + '-' + col + '-' + 'diff'] = diff | |
if col != 'yearbuilt': | |
X[group_col + '-' + col + '-' + 'percent'] = diff / mean | |
# Set the values of the new features to NaN if the groupcnt is too small (prevent overfitting) | |
threshold = 100 | |
X.loc[X[group_col + '-groupcnt'] < threshold, new_columns] = np.nan
# Drop the mean features which are not as useful | |
X = X.drop([group_col+'-'+col+'-mean' for col in self.agg_cols], axis=1) | |
return X | |
# Code to test pipeline | |
# aggregated_feat_creator = CreateAggregatedFeatures(group_col=group_col, agg_cols=agg_cols) | |
# X_temp = aggregated_feat_creator.fit_transform(X_temp) | |
"""## One-Hot Encoding Categorical Variables + Standardizing Numerical Variables | |
The following custom transformer uses `ColumnTransformer` to perform One-Hot Encoding on Categorical Features and Robust Scaler on Numerical Features. | |
""" | |
class FeatureEncoderAndScaler(BaseEstimator, TransformerMixin): | |
def __init__(self, features_to_encode=None, features_to_scale=None, numeric_types=["float"]): | |
self.features_to_encode = features_to_encode | |
self.features_to_scale = features_to_scale | |
self.numeric_types = numeric_types | |
self.feature_encoder_and_scaler = None | |
def fit(self, X, y=None): | |
if not self.features_to_encode: | |
self.features_to_encode = X.select_dtypes(include = ["object"]).columns | |
if not self.features_to_scale: | |
self.features_to_scale = X.select_dtypes(include = self.numeric_types).columns | |
feature_encoder_scaler = ColumnTransformer([ | |
("ohe_cats", OneHotEncoder(handle_unknown='ignore', sparse=False), self.features_to_encode), | |
("num_scaler", RobustScaler(), self.features_to_scale), | |
], | |
remainder='passthrough', | |
# verbose_feature_names_out=False,  # To turn off prefixing of the transformer name to each feature
) | |
self.feature_encoder_scaler = feature_encoder_scaler.fit(X) | |
return self | |
def transform(self, X): | |
# OneHotEncoder returns numpy array which is converted to dataframe | |
X_np = self.feature_encoder_scaler.transform(X) | |
X = pd.DataFrame( | |
X_np, | |
columns=self.feature_encoder_scaler.get_feature_names_out() | |
) | |
X = self.convert_feature_types(X) | |
return X | |
def convert_feature_types(self, X): | |
"""Convert feature types to object, float, bool based on the column name. | |
Columns with `ohe_cats` are object, `num_scaler` are float, `remainder` are bool""" | |
for column in X: | |
if 'ohe_cats' in column: | |
X[column] = X[column].astype("object") | |
elif 'num_scaler' in column: | |
X[column] = X[column].astype("float") | |
elif 'remainder' in column: | |
X[column] = X[column].astype("boolean") | |
return X | |
# Code to test pipeline | |
# cat_features = X_test.select_dtypes(include = ["object"]).columns # optional as encoder automatically detects relevant features | |
# feature_encoder_scaler = FeatureEncoderAndScaler(features_to_encode=cat_features) | |
feature_encoder_scaler = FeatureEncoderAndScaler() | |
X_temp = feature_encoder_scaler.fit_transform(X_temp) | |
# Sample plot histogram to check standardized variable | |
X_temp.num_scaler__calculatedfinishedsquarefeet.hist(bins=50, figsize=(8,5)) | |
plt.show() | |
# Correlation map to see how the features are correlated with one another
corrmat = X_temp.corr() | |
plt.subplots(figsize=(12,9)) | |
sns.heatmap(corrmat, vmax=0.9, square=True) | |
"""## Create Polynomial Features Pipeline | |
To better detect higher-degree relationships between the target variable and the predictors, we'll find the most important features relative to the target and create 3 new polynomial variables for each of the top 8 existing features (out of the 15 numerical features). The three new polynomial features are:
- `feature^2`
- `feature^3`
- `sqrt(feature)`
This transformation is mainly useful for models such as linear regression; more complex models such as Random Forests and GBMs can detect non-linear patterns in the data without such feature engineering.
""" | |
# Find most important features relative to target (Take absolute value) | |
print("Find most important features relative to target") | |
corr_df = X_temp.copy() | |
corr_df["logerror"] = y_train.values | |
corr = corr_df.corr() | |
most_corr_feat = corr.logerror.abs().sort_values(ascending=False)[1:9].index | |
most_corr_feat | |
class CreatePolynomialFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Creates new polynomial features from the most important features relative to the target (the top 8 by absolute correlation).
3 new polynomial variables for each of the existing features: squared, cubed, sqrt. | |
""" | |
def __init__(self, most_imp_feat): | |
self.most_imp_feat = most_imp_feat | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
for var in self.most_imp_feat: | |
# New var names | |
s2_var_name = var + '-s2' | |
s3_var_name = var + '-s3' | |
sq_var_name = var + '-sqrt' | |
# Create features | |
X[s2_var_name] = X[var] ** 2 | |
X[s3_var_name] = X[var] ** 3 | |
X[sq_var_name] = np.sqrt(X[var] + abs(min(X[var]))) # Translate feature to ensure min value is 0 before sqrt | |
return X | |
# Code to test pipeline | |
poly_feat_creator = CreatePolynomialFeatures(most_corr_feat) | |
X_temp = poly_feat_creator.fit_transform(X_temp) | |
# Code to detect any skewed features | |
numeric_feats = X_temp.dtypes[X_temp.dtypes == 'float'].index | |
skewed_feats = X_temp[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False) | |
skewed_feats[:20] | |
X_temp.num_scaler__lotsizesquarefeet.hist(bins=10, figsize=(8,5)) | |
plt.show() | |
from datetime import date | |
from scipy.special import boxcox1p | |
class BoxCoxSkewedFeatures(BaseEstimator, TransformerMixin): | |
""" | |
Performs a Box-Cox transformation on all numerical variables with skewness
above a certain threshold. | |
""" | |
def __init__(self, skewness_thres=0.75): | |
self.skewness_thres = skewness_thres | |
def fit(self, X, y=None): | |
return self # nothing else to do | |
def transform(self, X): | |
numeric_feats = X.dtypes[X.dtypes == 'float'].index | |
skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False) | |
skewed_feats = skewed_feats[abs(skewed_feats) > self.skewness_thres].index | |
# Apply box-cox to each variable | |
lam = 0.18 | |
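# boxcox1p(x, lam) applies the Box-Cox transform to (1 + x); lam is a fixed,
# manually chosen lambda here rather than one estimated per feature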
for feat in skewed_feats: | |
X[feat] = X[feat] + abs(min(X[feat])) # Translate feature to ensure minimum value is 0 | |
X[feat] = boxcox1p(X[feat], lam) | |
return X | |
# # Code to test pipeline | |
skew_transformer = BoxCoxSkewedFeatures() | |
X_temp = skew_transformer.transform(X_temp) | |
# Check skewed features after box-cox transformation | |
numeric_feats = X_temp.dtypes[X_temp.dtypes == 'float'].index | |
skewed_feats = X_temp[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False) | |
skewed_feats[:20] | |
X_temp.num_scaler__lotsizesquarefeet.hist(bins=10, figsize=(8,5)) | |
plt.show() | |
X_temp.head() | |
# Clear memory | |
del X_temp; gc.collect() | |
X_prepared = X_train.copy() | |
X_prepared_val = X_val.copy() | |
# Feature Dropper Pipeline | |
feature_dropper = FeatureDropper(features_to_drop=lin_reg_drop_vars) | |
# Convert Date Features Pipeline | |
year_feat_creator = CreateYearFeatures(date_features=date_features) | |
# Feature Encoding and Scaling Pipeline | |
feature_encoder_scaler = FeatureEncoderAndScaler() | |
# Transform Skewed Numerical Features Pipeline | |
skew_transformer = BoxCoxSkewedFeatures() | |
# Two versions below: Univariate Imputation and Multivariate Imputation - ONLY UNCOMMENT ONE SECTION | |
########################################### | |
# 1) Univariate Imputation Pipeline | |
convert_to_bool = ["pooltypeid2", "decktypeid", "taxdelinquencyflag"] | |
convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty"] | |
univariate_impute_pipe = ColumnTransformer([ | |
("impute_0", SimpleImputer(strategy="constant", fill_value=0), impute_0_vars), | |
("impute_mode", SimpleImputer(strategy="most_frequent"), impute_mode_vars), | |
("impute_median", SimpleImputer(strategy="median"), impute_median_vars), | |
], | |
remainder='passthrough' | |
) | |
col_name_appender = ColumnNamesAppender(univariate_impute_pipe, X_train.columns, num_transformers=3) | |
feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string, | |
convert_to_float=convert_to_float, convert_to_bool=convert_to_bool) | |
poly_feat_creator = CreatePolynomialFeatures(most_corr_feat) | |
############################################ | |
# 2) Multivariate Version | |
# convert_to_string= ["airconditioningtypeid", "heatingorsystemtypeid", "fips", "propertylandusetypeid", "regionidcounty", "pooltypeid2", "decktypeid", "taxdelinquencyflag"] | |
# multivariate_impute_pipe = ColumnTransformer([ | |
# ("impute_cats", SimpleImputer(strategy="constant", fill_value='missing'), cat_impute_vars), | |
# ("impute_num", IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, max_depth=30, min_samples_leaf=32), random_state=0, max_iter=1), numeric_impute_vars), | |
# ], | |
# remainder='passthrough' | |
# ) | |
# col_name_appender = ColumnNamesAppender(multivariate_impute_pipe, X_train.columns, num_transformers=3) | |
# feature_type_changer = ConvertFeatureType(convert_to_int=convert_to_int, convert_to_string=convert_to_string, convert_to_float=convert_to_float) | |
# # most_corr_feat = list(map(lambda x: x.replace('num_scaler__',''), most_corr_feat)) # Clean up feature names by removing `num_scaler__` | |
# poly_feat_creator = CreatePolynomialFeatures(most_corr_feat) | |
lin_reg_preprocessor = Pipeline([ | |
('feature_dropper', feature_dropper), | |
('univariate_impute_pipe', univariate_impute_pipe), | |
# ('multivariate_impute_pipe', multivariate_impute_pipe), | |
('col_name_appender', col_name_appender), | |
('feature_type_changer', feature_type_changer), | |
('year_feat_creator', year_feat_creator), | |
('feature_encoder_scaler', feature_encoder_scaler), | |
('poly_feat_creator', poly_feat_creator), | |
('skew_transformer', skew_transformer), | |
]) | |
data_prep_pipe = lin_reg_preprocessor.fit(X_prepared) | |
X_prepared = lin_reg_preprocessor.transform(X_prepared) | |
X_prepared_val = lin_reg_preprocessor.transform(X_prepared_val) | |
from sklearn import set_config | |
set_config(display='diagram') | |
lin_reg_preprocessor | |
"""# Select and Train Models | |
**As this project is primarily for learning purposes, candidate machine learning algorithms are analyzed in greater detail before being rejected, in order to better understand them for future use.**
""" | |
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet | |
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV | |
from sklearn.model_selection import cross_val_score | |
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer | |
lin_reg = LinearRegression() | |
lin_reg.fit(X_prepared, y_train) | |
# let's try the full preprocessing pipeline on a few training instances | |
some_data = X_train.iloc[:5] | |
some_labels = y_train.iloc[:5] | |
some_data_prepared = data_prep_pipe.transform(some_data) | |
print("Predictions:", lin_reg.predict(some_data_prepared)) | |
print("Labels:", list(some_labels)) | |
# Naive baselines: always predicting the mean of y_train gives an MAE equal to the
# mean absolute deviation and an RMSE equal to the standard deviation of y_train
print(f"MAE Baseline: {y_train.mad()}")
print(f"RMSE Baseline: {y_train.std()}")
def get_eval_metrics(models, X, y_true): | |
""" | |
Calculates MAE (Mean Absolute Error) and RMSE (Root Mean Squared Error) on the data set for the input models.
`models`: list of fit models | |
""" | |
for model in models: | |
y_pred= model.predict(X) | |
rmse = mean_squared_error(y_true, y_pred, squared=False) | |
mae = mean_absolute_error(y_true, y_pred) | |
print(f"Model: {model}") | |
print(f"MAE: {mae}, RMSE: {rmse}") | |
# Test usage of RMSE function | |
# get_eval_metrics([lin_reg, ridge_reg, lasso_reg], X_prepared_val, y_val) | |
def display_scores(model, scores): | |
print("-"*50) | |
print("Model:", model) | |
print("\nScores:", scores) | |
print("\nMean:", scores.mean()) | |
print("\nStandard deviation:", scores.std()) | |
def get_cross_val_scores(models, X, y, cv=10, fit_params=None): | |
""" | |
Performs k-fold cross validation and calculates MAE for each fold for all input models. | |
`models`: list of fit models | |
""" | |
for model in models: | |
mae = -cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=cv, fit_params=fit_params) | |
display_scores(model, mae) | |
# Test usage of cross val function | |
# get_cross_val_scores([lin_reg, ridge_reg], X_prepared, y_train, cv=5) | |
# Linear Regression | |
lin_reg = LinearRegression() | |
lin_reg.fit(X_prepared, y_train) | |
get_eval_metrics([lin_reg], X_prepared_val, y_val) | |
get_cross_val_scores([lin_reg], X_prepared, y_train, cv=5) | |
from sklearn.metrics import mean_squared_error | |
from sklearn.model_selection import train_test_split | |
def plot_learning_curves(model, X_train, y_train, X_val, y_val): | |
""" | |
Train the input model on different sized subsets and test on validation set. | |
Output a plot of training and validation error for the different sized subsets. | |
""" | |
train_errors, val_errors = [], [] | |
num_instances = np.linspace(1, len(X_train), num=15).astype(int) | |
for m in num_instances: | |
model.fit(X_train[:m], y_train[:m]) | |
y_train_predict = model.predict(X_train[:m]) | |
y_val_predict = model.predict(X_val) | |
train_errors.append(mean_absolute_error(y_train[:m], y_train_predict)) | |
val_errors.append(mean_absolute_error(y_val, y_val_predict)) | |
plt.plot(num_instances, train_errors, "r-+", linewidth=2, label="train") | |
plt.plot(num_instances, val_errors, "b-", linewidth=3, label="val") | |
plt.legend(loc='best') | |
plt.title(model) | |
return plt | |
lin_reg = LinearRegression() | |
plt = plot_learning_curves(lin_reg, X_prepared, y_train, X_prepared_val, y_val) | |
plt.ylim(0, 3000) | |
plt.show() | |
# Set of alpha values to test | |
alphas = np.logspace(0,2,20) | |
print(f"Testing with alphas={alphas}") | |
# Tune Ridge Regression | |
ridgecv = RidgeCV(alphas=alphas) | |
ridgecv.fit(X_prepared, y_train) | |
print(f"Best Ridge Alpha: {ridgecv.alpha_}") | |
# Set of alpha values to test | |
alphas = np.logspace(1,5,20) | |
print(f"Testing with alphas={alphas}") | |
# Tune Lasso Regression | |
lassocv = LassoCV(alphas=alphas) | |
lassocv.fit(X_prepared, y_train) | |
print(f"Best Lasso Alpha: {lassocv.alpha_}") | |
"""**The extremely high value of `alpha` for Lasso regression above will regularize the model heavily and push coefficients to zero.**""" | |
# Fit using optimal alpha | |
ridge = Ridge(alpha=ridgecv.alpha_) | |
ridge.fit(X_prepared, y_train) | |
lasso = Lasso(alpha=lassocv.alpha_) | |
lasso.fit(X_prepared, y_train) | |
# get_eval_metrics([ridge, lasso], X_prepared_val, y_val) | |
get_cross_val_scores([ridge, lasso], X_prepared, y_train, cv=5) | |
# Learning Curves for Ridge and Lasso | |
ridge = Ridge(alpha=ridgecv.alpha_) | |
plt = plot_learning_curves(ridge, X_prepared, y_train, X_prepared_val, y_val) | |
plt.show() | |
lasso = Lasso(alpha=lassocv.alpha_) | |
plt = plot_learning_curves(lasso, X_prepared, y_train, X_prepared_val, y_val) | |
plt.show()
"""**As expected, even the regularized linear models do not perform any better. Thus, we need to consider more complex algorithms that make fewer assumptions, in order to better capture the relationship between the features and the target variable.**"""
X_prepared = X_train.copy() | |
X_prepared_val = X_val.copy() | |
# Feature Dropper Pipeline | |
rf_drop_vars = ['parcelid'] | |
feature_dropper = FeatureDropper(features_to_drop=rf_drop_vars) | |
# Create Date Features | |
date_feat_creator = CreateDateFeatures() | |
final_column_names = list(X_train.columns) + ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter'] | |
# Univariate Impute Pipeline | |
rf_impute_median_vars = [ | |
"lotsizesquarefeet", "bathroomcnt", "bedroomcnt", "calculatedfinishedsquarefeet", "structuretaxvaluedollarcnt", | |
"landtaxvaluedollarcnt", "latitude", "longitude", "finishedsquarefeet13", "finishedsquarefeet15", | |
"finishedfloor1squarefeet", "finishedsquarefeet50", "finishedsquarefeet6", "threequarterbathnbr", | |
"calculatedbathnbr", "fullbathcnt", "numberofstories", "finishedsquarefeet12", "taxvaluedollarcnt", | |
"taxamount", "roomcnt", "basementsqft", "yardbuildingsqft26", "unitcnt" | |
] | |
rf_impute_0_vars = ["yardbuildingsqft17", "fireplacecnt", "poolcnt", "garagecarcnt", "garagetotalsqft", "poolsizesum"] | |
rf_cat_vars = [ | |
'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid', | |
'decktypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2', | |
'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc', | |
'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', | |
'storytypeid', 'typeconstructiontypeid', 'fireplaceflag', 'assessmentyear', | |
'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock', 'yearbuilt', | |
'transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', | |
] | |
univariate_impute_pipe = ColumnTransformer([ | |
("rf_impute_cats", SimpleImputer(strategy="constant", fill_value="missing"), rf_cat_vars), | |
("rf_impute_0_vars", SimpleImputer(strategy="constant", fill_value=0), rf_impute_0_vars), | |
("rf_impute_median", SimpleImputer(strategy="median"), rf_impute_median_vars), | |
], | |
remainder='passthrough' | |
) | |
# Column Names Appender | |
col_name_appender = ColumnNamesAppender(univariate_impute_pipe, final_column_names, num_transformers=3) | |
# Convert to Cat + One-Hot Encoding | |
convert_to_cat = ConvertToType(var_type='str', vars_to_convert=rf_cat_vars) | |
feature_encoder = ColumnTransformer([ | |
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), rf_cat_vars) | |
], | |
remainder='passthrough' | |
) | |
rf_preprocessor = Pipeline([ | |
('date_feat_creator', date_feat_creator), | |
('feature_dropper', feature_dropper), | |
('univariate_impute_pipe', univariate_impute_pipe), | |
('col_name_appender', col_name_appender), | |
('convert_to_cat', convert_to_cat), | |
('feature_encoder', feature_encoder), | |
]) | |
data_prep_pipe = rf_preprocessor.fit(X_prepared) | |
X_prepared = rf_preprocessor.transform(X_prepared) | |
X_prepared_val = rf_preprocessor.transform(X_prepared_val) | |
X_rf = X_prepared.copy() | |
X_rf_val = X_prepared_val.copy() | |
"""### Random Search + Grid Search Results | |
Below are the results from using Random Search in combination with Grid Search to arrive at the optimized hyperparameters. Sample code for each method is shown in cells below. | |
1) ``Random Search with max_samples=0.1`` | |
Random Grid with 20 different combinations = | |
'max_depth': [5, 16, 27, 38, 50, None], | |
'max_features': ['auto', 0.6, 0.7, 0.8], | |
'min_samples_leaf': [32, 64, 128, 256], | |
Best Params = | |
{'max_depth': 50, | |
'max_features': 0.6, | |
'min_samples_leaf': 32,} | |
2) ``Grid Search with max_samples=0.1`` | |
Grid = | |
{'max_features': ['auto', 0.5, 0.6, 0.8, 0.9], | |
'max_depth': [10, 40, 70, 100, None], | |
'min_samples_leaf': [16, 32, 64],} | |
Best Params: | |
{'max_depth': 70, | |
'max_features': 0.6, | |
'min_samples_leaf': 32,} | |
3) ``Final Grid Search``: Tune ``n_estimators`` and ``max_samples`` | |
Grid = | |
{'max_features': [0.6], | |
'max_depth': [70], | |
'min_samples_leaf': [32] | |
'max_samples': [0.1, 0.3, 0.5], | |
'n_estimators': [100, 400, 600]} | |
Best Params: | |
{'n_estimators': 400, | |
'max_samples': 0.3, | |
'max_depth': 70, | |
'max_features': 0.6, | |
'min_samples_leaf': 32,} | |
**Given that Random Forests already perform well out of the box, and given the increasingly long training times, there is little benefit in further optimizing the hyperparameters of the Random Forest Regressor.**
""" | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import RandomizedSearchCV | |
from sklearn.model_selection import GridSearchCV | |
# Fraction of original dataset given to any individual tree | |
max_samples = [0.1, 0.2] | |
# Number of features to consider at every split | |
#max_features = ['auto', 0.4, 0.6, 0.8, 0.9] | |
max_features = [0.4, 0.8] | |
# Maximum number of levels in tree | |
#max_depth = [int(x) for x in np.linspace(50, 100, num = 3)] | |
max_depth = [int(x) for x in np.linspace(20, 60, num = 3)] | |
# max_depth.append(None) | |
# Minimum number of samples required at each leaf node | |
min_samples_leaf = [16, 32, 48] | |
# Create the tuning grid | |
param_grid = { | |
'max_samples': max_samples, | |
'max_features': max_features, | |
'max_depth': max_depth, | |
'min_samples_leaf': min_samples_leaf, | |
} | |
###### Random Grid Sample Code ####### | |
# Use the random grid to narrow down best hyperparameters | |
# First create the base model to tune | |
rf = RandomForestRegressor(max_samples=0.1) | |
# Random search of parameters, using 3 fold cross validation, | |
# search across 20 different combinations, and use all available cores | |
#rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1) | |
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=3, cv=3, verbose=2, random_state=42, n_jobs=-1) | |
# Fit the random search model | |
rf_random.fit(X_prepared, y_train) | |
###### Grid Search Sample Code ####### | |
# Use grid search to further narrow down the best hyperparameters
# First create the base model to tune | |
rf = RandomForestRegressor() | |
# Grid search of parameters, using 3 fold cross validation | |
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2) | |
# Fit the grid search model
rf_grid.fit(X_prepared, y_train) | |
# Output the best estimator from grid search and its hyperparameters
best_rf = rf_grid.best_estimator_ | |
print(best_rf) | |
#best_rf.best_params_ | |
# BEWARE: With high max_samples and 400 n_estimators, this code takes almost an hour to run
best_params = { | |
#'n_estimators': 400, | |
'n_estimators': 100, | |
#'max_samples': 0.3, | |
'max_samples': 0.2, | |
#'max_depth': 70, | |
'max_depth': 40, | |
#'max_features': 0.6, | |
'max_features': 0.4, | |
#'min_samples_leaf': 32, | |
'min_samples_leaf': 16, | |
'random_state': 42, | |
} | |
ran_forest = RandomForestRegressor(**best_params) | |
ran_forest.fit(X_prepared, y_train) | |
get_eval_metrics([ran_forest], X_prepared_val, y_val) | |
# get_cross_val_scores([ran_forest], X_prepared, y_train, cv=3) | |
# WARNING: With high max_samples and high n_estimators, this code takes almost 2 hours to run | |
from sklearn.ensemble import ExtraTreesRegressor | |
# Using the best parameters from Random Forest Regressor | |
best_params = { | |
'n_estimators': 100, | |
'max_samples': 0.3, | |
'max_depth': 70, | |
'max_features': 0.6, | |
'min_samples_leaf': 32, | |
'random_state': 42, | |
} | |
extra_trees_reg = ExtraTreesRegressor(**best_params) | |
extra_trees_reg.fit(X_prepared, y_train) | |
# get_cross_val_scores([extra_trees_reg], X_prepared, y_train, cv=3) | |
# Linear SVM Regression | |
# Trains extremely quickly but is not very useful here, as the relationship between the Zillow features and the target is not linear
from sklearn.svm import LinearSVR | |
svm_reg_linear = LinearSVR(random_state=42) | |
svm_reg_linear.fit(X_prepared, y_train) | |
# SVM with Polynomial Kernel - 'transforms' data points to polynomial space using 'kernel trick' | |
from sklearn.svm import SVR | |
svm_poly_reg = SVR(kernel="poly") | |
svm_poly_reg.fit(X_prepared, y_train) | |
# SVM with Radial Basis Function (RBF) Kernel | |
rbf_svm = SVR(kernel = 'rbf') | |
rbf_svm.fit(X_prepared, y_train) | |
get_eval_metrics([svm_reg_linear, svm_poly_reg, rbf_svm], X_prepared_val, y_val) | |
"""# 4) Gradient Boosting Machines | |
# 4-a) XGBoost | |
Even though Scikit-Learn has its own implementation of gradient boosting, it lags behind other implementations in both performance and speed. Thus, we will skip Scikit-Learn's implementation in favor of XGBoost, LightGBM, and CatBoost.
### Missing Data | |
One of the most powerful capabilities of gradient boosting machines is their ability to handle missing data. They do not require imputation and are designed to extract as much information as possible from rows with missing values.
""" | |
import xgboost as xgb | |
from sklearn.model_selection import cross_val_score | |
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer | |
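# Minimal illustration (toy arrays, not the Zillow features): XGBoost accepts NaNs
# directly and learns a default split direction for missing values, so no imputation
# step is needed before fitting.
X_toy = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 0.5], [4.0, 1.0]])
y_toy = np.array([1.0, 2.0, 1.5, 3.0])
xgb.XGBRegressor(n_estimators=5).fit(X_toy, y_toy)  # trains despite the missing values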
X_prepared = X_train.copy() | |
X_prepared_val = X_val.copy() | |
# Data Prep for XGBoost with Original Data | |
# Feature Dropper Pipeline | |
xgb_drop = ['parcelid'] | |
feature_dropper = FeatureDropper(features_to_drop=xgb_drop) | |
# Convert Year Features Pipeline | |
year_feat_creator = CreateYearFeatures(date_features=date_features) | |
# Date Feature Creator | |
date_feat_creator = CreateDateFeatures() | |
# Feature Encoding Pipeline | |
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', | |
'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid', | |
'decktypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid10', 'pooltypeid2', | |
'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc', | |
'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', | |
'storytypeid', 'typeconstructiontypeid', 'fireplaceflag', 'assessmentyear', | |
'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock', | |
] | |
feature_encoder = ColumnTransformer([ | |
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), cat_vars) | |
], | |
remainder='passthrough' | |
) | |
xgb_preprocessor = Pipeline([ | |
('feature_dropper', feature_dropper), | |
('date_feat_creator', date_feat_creator), | |
('year_feat_creator', year_feat_creator), | |
('feature_encoder', feature_encoder), | |
]) | |
data_prep_pipe = xgb_preprocessor.fit(X_prepared) | |
X_prepared = xgb_preprocessor.transform(X_prepared) | |
X_prepared_val = xgb_preprocessor.transform(X_prepared_val) | |
X_xgb = X_prepared.copy() | |
X_xgb_val = X_prepared_val.copy() | |
# Initialize XGBoost Model | |
params = { | |
'learning_rate': 0.3, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
xgb_base = xgb.XGBRegressor(**params) | |
# Fit model using early stopping and validation set | |
fit_params={'early_stopping_rounds': 10, | |
'eval_metric': 'mae', | |
'eval_set': [[X_prepared_val, y_val]]} | |
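# With early stopping, the large n_estimators (10000) is only an upper bound: boosting
# stops once the validation MAE has not improved for 10 consecutive rounds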
xgb_base.fit(X_prepared, y_train, **fit_params) | |
fit_params={'early_stopping_rounds': 10, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]]} | |
# get_eval_metrics([xgb_base], X_prepared_val, y_val) | |
get_cross_val_scores([xgb_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
fit_params={ | |
'early_stopping_rounds': 10, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]], | |
} | |
other_params = { | |
'learning_rate': 0.2, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
def param_exploration(param_name, param_vals, other_params=other_params, fit_params=fit_params, cross_val=False): | |
""" | |
Function to explore the impact of different values for ONE parameter on MAE. | |
Tested using either single validation test set or cross validation (cross_val=True) | |
""" | |
mae_vec = np.zeros(len(param_vals)) | |
for i, val in enumerate(param_vals): | |
other_params[param_name] = val  # use the base params passed to the function, not the earlier global dict
xgb_temp = xgb.XGBRegressor(**other_params)
if cross_val: | |
# TESTING WITH CROSS VALIDATION | |
mae = -cross_val_score(xgb_temp, X_prepared, y_train, scoring="neg_mean_absolute_error", cv=3, fit_params=fit_params) | |
mae_vec[i] = mae.mean() | |
else: | |
# TESTING WITH SINGLE VALIDATION SET | |
xgb_temp.fit(X_prepared, y_train, **fit_params) | |
preds = xgb_temp.predict(X_prepared_val) | |
mae_vec[i] = mean_absolute_error(y_val, preds) | |
plt.plot(param_vals, mae_vec) | |
# Max Depth Tuning | |
# From initial testing, cross validation was necessary when testing the different values as the model was starting to overfit on the single validation set. | |
# BEWARE - TAKES 5-10 MINUTES TO RUN | |
max_depth_vals_vec=list(range(1,7)) | |
param_exploration(param_name="max_depth", param_vals=max_depth_vals_vec, cross_val=True) | |
"""## Manually Exploring Parameters (One by One) | |
**Warning**: Each of the plots generated below take a few minutes to run. | |
""" | |
# Subsample: Fraction of dataset to use to build each tree | |
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, .95, 1] | |
param_exploration(param_name="subsample", param_vals=param_vals_vec, cross_val=False) | |
# ColSample By Tree: Ratio of columns chosen when constructing each tree
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1] | |
param_exploration(param_name="colsample_bytree", param_vals=param_vals_vec, cross_val=False) | |
# ColSample By Node: Ratio of columns chosen at each node (split) | |
param_vals_vec=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1] | |
param_exploration(param_name="colsample_bynode", param_vals=param_vals_vec, cross_val=False) | |
# Reg Lambda: Regularization parameter from Ridge Regression, the larger the value, the more conservative the algorithm | |
param_vals_vec=[1, 2, 3, 5, 10, 20] | |
param_exploration(param_name="reg_lambda", param_vals=param_vals_vec, cross_val=False) | |
# Reg Alpha: Regularization parameter from Lasso Regression, the larger the value, the more conservative the algorithm | |
param_vals_vec=[1, 2, 3, 5, 10, 20] | |
param_exploration(param_name="reg_alpha", param_vals=param_vals_vec, cross_val=False) | |
# Gamma: Another regularization parameter - the larger the value, the more conservative the algorithm
param_vals_vec=[0.1, 0.3, 0.5, 0.7, 1, 2] | |
param_exploration(param_name="gamma", param_vals=param_vals_vec, cross_val=False) | |
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
"""We must define the "loss function", which takes a set of parameters and outputs the loss value."""
def eval_model(params): | |
xgb_base = xgb.XGBRegressor() | |
xgb_base.set_params(**params) | |
# TESTING WITH SINGLE VALIDATION SET | |
# xgb_base.fit(X_prepared, y_train, eval_set=[(X_prepared_val, y_val)], early_stopping_rounds=20, | |
# eval_metric='mae', verbose=False) | |
# preds = xgb_base.predict(X_prepared_val) | |
# mae = mean_absolute_error(y_val, preds) | |
# TESTING WITH CROSS VALIDATION | |
fit_params={'early_stopping_rounds': 15, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]]} | |
mae = -cross_val_score(xgb_base, X_prepared, y_train, scoring="neg_mean_absolute_error", cv=3, fit_params=fit_params) | |
mae = mae.mean() | |
return(mae) | |
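# Note: hyperopt's fmin minimizes whatever eval_model returns; returning the raw MAE
# works, though returning a dict such as {'loss': mae, 'status': STATUS_OK} is the more
# common pattern (STATUS_OK is imported above)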
"""- Next, we define the parameter space. | |
- There are several options to define ranges. | |
- `randint`: chooses a random integer between 0 and an upper bound (exclusive)
- `choice`: chooses from a set of specific values
- `uniform`: chooses a float uniformly within the given range
- `quniform`: chooses a float from a discretized (quantized) range
See more at hyperopt documentation: | |
http://hyperopt.github.io/hyperopt/getting-started/search_spaces/ | |
""" | |
fspace1 = { | |
'max_depth': hp.randint('max_depth', 7),  # integers in [0, 7), i.e. up to 6
'colsample_bynode': hp.uniform('colsample_bynode',.3,1), | |
'subsample': hp.quniform('subsample',.3, 1,.05), | |
'gamma':hp.uniform('gamma',.2, 5), | |
'tree_method': hp.choice('tree_method', ['approx', 'exact']), | |
'n_estimators': 10000, | |
'learning_rate': 0.15, | |
'random_state': 42, | |
} | |
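"""A quick sanity check (a sketch, not part of the original notebook): ``hyperopt.pyll.stochastic.sample``
draws one random configuration from the space, which is handy for confirming each primitive behaves as
expected before launching a long ``fmin`` run."""
from hyperopt.pyll.stochastic import sample
# Draw a single random point from fspace1 (constants such as n_estimators pass through unchanged)
print(sample(fspace1))  # e.g. {'max_depth': 3, 'subsample': 0.7, 'tree_method': 'exact', ...}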
# Finally, we use the `fmin` function and record our trials in a `Trials` object | |
# BEWARE: THIS CODE TOOK 7+ HOURS TO RUN WHEN PERFORMING 1200 TRIALS | |
trials1 = Trials() | |
best_param = fmin(fn=eval_model, space=fspace1, algo=tpe.suggest, max_evals=10, trials=trials1) | |
best_param | |
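"""Note (an aside, not from the original notebook): for ``hp.choice`` parameters, ``fmin`` returns the *index*
of the selected option rather than the option itself. ``hyperopt.space_eval`` maps the returned dictionary
back to actual parameter values."""
from hyperopt import space_eval
# Translate choice indices (e.g. tree_method) back into their actual values
print(space_eval(fspace1, best_param))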
"""The individual trials can be explored to detect trends within specific parameters | |
""" | |
# Extract data from hyperopt trials | |
max_depth_trial_vals = np.array([t['misc']['vals']['max_depth'] for t in trials1.trials]) | |
ss_trial_vals = np.array([t['misc']['vals']['subsample'] for t in trials1.trials]) | |
gamma_trial_vals = np.array([t['misc']['vals']['gamma'] for t in trials1.trials]) | |
loss_trial_vals = np.array([t['result']['loss'] for t in trials1.trials]) | |
trial_nums = np.array([t['tid'] for t in trials1.trials]) | |
"""Plot trends between different parameter values and corresponding losses. | |
For example, the ``max_depth`` plot shows losses decreasing as depth increases, plateauing at a certain value. Due to the Kaggle notebook refreshing at startup, the plot from the 1200-trial experiment is no longer available :(
""" | |
# Plot trends between different parameter values and corresponding losses | |
plt.scatter(max_depth_trial_vals, loss_trial_vals, alpha=.2) | |
# plt.scatter(ss_trial_vals, loss_trial_vals, alpha=.2) | |
# plt.scatter(gamma_trial_vals, loss_trial_vals, alpha=.2) | |
"""### Best Params Model | |
Based on the 1200 ``hyperopt`` trials, the following hyperparameter values had the best loss: | |
``` | |
{'colsample_bynode': 0.382, | |
'gamma': 0.201, | |
'max_depth': 5, | |
'subsample': 0.95, | |
'tree_method': 'exact'} | |
``` | |
The other important parameters (``learning_rate``, ``n_estimators``) were already set. | |
""" | |
fit_params={ | |
'early_stopping_rounds': 15, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]], | |
} | |
best_params = { | |
'colsample_bynode': 0.75, | |
'gamma': 0.201, | |
'max_depth': 5, | |
'subsample': 0.95, | |
'tree_method': 'exact', | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
xgb_base = xgb.XGBRegressor(**best_params) | |
xgb_base.fit(X_prepared, y_train, **fit_params) | |
get_cross_val_scores([xgb_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
!pip install optuna | |
import lightgbm as lgbm | |
from sklearn.model_selection import cross_val_score, KFold | |
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer | |
import optuna # !pip install optuna | |
from optuna.integration import LightGBMPruningCallback | |
X_prepared = X_train.copy() | |
X_prepared_val = X_val.copy() | |
class ConvertToCategorical(BaseEstimator, TransformerMixin): | |
''' | |
Pipeline to convert categorical variables to Pandas Category type. | |
This pipeline is specific to LGBM which handles categorical vars differently. | |
''' | |
def __init__(self, cat_vars): | |
self.cat_vars = cat_vars | |
def fit(self, X, y=None): | |
return self # Nothing else to do | |
def transform(self, X): | |
for col in self.cat_vars: | |
X[col] = pd.Categorical(X[col]) | |
return X | |
# Code to test pipeline | |
# convert_to_cat = ConvertToCategorical(cat_vars=cat_vars) | |
# X_temp = convert_to_cat.fit_transform(X_temp) | |
# Feature Dropper Pipeline | |
var_drop = ['parcelid',] | |
feature_dropper = FeatureDropper(features_to_drop=var_drop) | |
# Convert Date Features Pipeline | |
year_feat_creator = CreateYearFeatures(date_features=date_features) | |
# Date Feature Creator | |
date_feat_creator = CreateDateFeatures() | |
# Derived Features Creator | |
derived_feat_creator = CreateDerivedFeatures() | |
# Feature Encoding Pipeline | |
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', | |
'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', | |
'regionidneighborhood', 'regionidzip', 'assessmentyear', 'typeconstructiontypeid', 'architecturalstyletypeid', | |
            'buildingclasstypeid', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'hashottuborspa',
'taxdelinquencyyear', 'taxdelinquencyflag', 'fireplaceflag', 'decktypeid', 'pooltypeid10', | |
'propertycountylandusecode', 'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock' | |
] | |
# Feature Encoding Pipeline | |
feature_encoder = ColumnTransformer([ | |
("ohe_cats", OneHotEncoder(handle_unknown='ignore'), cat_vars)], | |
remainder='passthrough' | |
) | |
lgbm_preprocessor = Pipeline([ | |
('derived_feat_creator', derived_feat_creator), | |
('feature_dropper', feature_dropper), | |
('date_feat_creator', date_feat_creator), | |
('year_feat_creator', year_feat_creator), | |
('feature_encoder', feature_encoder), | |
]) | |
data_prep_pipe = lgbm_preprocessor.fit(X_prepared) | |
X_prepared = lgbm_preprocessor.transform(X_prepared) | |
X_prepared_val = lgbm_preprocessor.transform(X_prepared_val) | |
X_lgbm = X_prepared.copy() | |
X_lgbm_val = X_prepared_val.copy() | |
# Initialize LightGBM Model | |
params = { | |
'n_estimators': 10000, | |
'random_state': 42 | |
} | |
# Fit model using early stopping and validation set | |
fit_params={'early_stopping_rounds': 30, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]] | |
} | |
lgbm_base = lgbm.LGBMRegressor(**params) | |
lgbm_base.fit(X_prepared, y_train, **fit_params) | |
get_cross_val_scores([lgbm_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
def objective(trial, X, y): | |
param_grid = { | |
"n_estimators": trial.suggest_categorical("n_estimators", [10000]), | |
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3), | |
"num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20), | |
"max_depth": trial.suggest_int("max_depth", 3, 12), | |
"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100), | |
"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5), | |
"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5), | |
"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15), | |
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1), | |
"bagging_freq": trial.suggest_categorical("bagging_freq", [1]), | |
"feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1), | |
} | |
cv = KFold(n_splits=5, shuffle=True, random_state=42) | |
cv_scores = np.empty(5) | |
for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)): | |
#X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] | |
X_train, X_test = X[train_idx], X[test_idx] | |
y_train, y_test = y[train_idx], y[test_idx] | |
model = lgbm.LGBMRegressor(**param_grid) | |
model.fit( | |
X_train, | |
y_train, | |
eval_set=[(X_test, y_test)], | |
eval_metric="mae", | |
verbose=False, | |
early_stopping_rounds=25, | |
callbacks=[ | |
LightGBMPruningCallback(trial, "l1") | |
], # Add a pruning callback | |
) | |
preds = model.predict(X_test) | |
cv_scores[idx] = mean_absolute_error(y_test, preds) | |
return np.mean(cv_scores) | |
# Similar to Hyperopt - this code creates an experiment using the objective function and parameter grid above | |
study = optuna.create_study(direction="minimize", study_name="LGBM Regressor") | |
func = lambda trial: objective(trial, X_prepared, y_train.to_numpy()) | |
study.optimize(func, n_trials=100) | |
print(f"\tBest value (mae): {study.best_value:.5f}") | |
print(f"\tBest params:") | |
for key, value in study.best_params.items(): | |
print(f"\t\t{key}: {value}") | |
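"""Optional (a sketch, assuming plotly is available in the environment): Optuna ships built-in plots that
help inspect how the search progressed and which parameters mattered most."""
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study).show()   # best objective value vs. trial number
plot_param_importances(study).show()      # relative importance of the tuned parameters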
"""### Best Params Model | |
Based on the 10000 ``Optuna`` trials, the following hyperparameter values had the best loss: | |
``` | |
best_params = { | |
'num_leaves': 512, | |
'max_depth': 9, | |
'lambda_l2': 15, | |
'bagging_fraction': 0.9, | |
'feature_fraction': 0.6, | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
``` | |
The other important parameters (``learning_rate``, ``n_estimators``) were already set. | |
""" | |
fit_params={ | |
'early_stopping_rounds': 30, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_prepared_val, y_val]] | |
} | |
best_params = { | |
'num_leaves': 512, | |
'max_depth': 9, | |
'lambda_l2': 15, | |
'bagging_fraction': 0.9, | |
'feature_fraction': 0.6, | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
lgbm_base = lgbm.LGBMRegressor(**best_params) | |
lgbm_base.fit(X_prepared, y_train, **fit_params) | |
get_cross_val_scores([lgbm_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
!pip install catboost | |
import catboost as cb | |
import optuna | |
from sklearn.model_selection import cross_val_score, KFold | |
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer | |
X_prepared = X_train.copy() | |
X_prepared_val = X_val.copy() | |
# Feature Dropper Pipeline | |
var_drop = ['parcelid',] | |
feature_dropper = FeatureDropper(features_to_drop=var_drop) | |
# Convert Date Features Pipeline | |
year_feat_creator = CreateYearFeatures(date_features=date_features) | |
# Feature Encoding Pipeline | |
cat_vars = ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', | |
'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', | |
'regionidneighborhood', 'regionidzip', 'assessmentyear', 'typeconstructiontypeid', 'architecturalstyletypeid', | |
            'buildingclasstypeid', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'hashottuborspa',
'taxdelinquencyyear', 'taxdelinquencyflag', 'fireplaceflag', 'decktypeid', 'pooltypeid10', | |
'propertycountylandusecode', 'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock' | |
] | |
convert_to_cat = ConvertToType(var_type='str', vars_to_convert=cat_vars) | |
# Date Feature Creator | |
date_feat_creator = CreateDateFeatures() | |
# Derived Features Creator | |
derived_feat_creator = CreateDerivedFeatures() | |
# Aggregated Feature Creator | |
group_col = 'regionidcity' | |
agg_cols = ['lotsizesquarefeet', 'yearbuilt', 'calculatedfinishedsquarefeet', | |
'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount', 'property_tax_per_sqft'] | |
aggregated_feat_creator = CreateAggregatedFeatures(group_col=group_col, agg_cols=agg_cols) | |
cb_preprocessor = Pipeline([ | |
('derived_feat_creator', derived_feat_creator), | |
# ('aggregated_feat_creator', aggregated_feat_creator), | |
('feature_dropper', feature_dropper), | |
('date_feat_creator', date_feat_creator), | |
('year_feat_creator', year_feat_creator), | |
('convert_to_cat', convert_to_cat), | |
]) | |
data_prep_pipe = cb_preprocessor.fit(X_prepared) | |
X_prepared = cb_preprocessor.transform(X_prepared) | |
X_prepared_val = cb_preprocessor.transform(X_prepared_val) | |
X_cb = X_prepared.copy() | |
X_cb_val = X_prepared_val.copy() | |
# Initialize CatBoost Model | |
params = { | |
'n_estimators': 100, | |
'random_state': 42, | |
'eval_metric': 'MAE', | |
} | |
# Fit model using early stopping and validation set | |
fit_params={ | |
'eval_set': (X_prepared_val, y_val), | |
'cat_features': cat_vars, | |
'early_stopping_rounds': 10, | |
'verbose': False, | |
} | |
cb_base = cb.CatBoostRegressor(**params) | |
cb_base.fit(X_prepared, y_train, **fit_params) | |
get_eval_metrics([cb_base], X_prepared_val, y_val) | |
# get_cross_val_scores([cb_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
"""## CatBoost Hyperparameter Tuning (Using Optuna) | |
Given Optuna's better documentation compared to ``hyperopt``, I decided to continue using it for CatBoost hyperparameter tuning.
""" | |
def objective(trial, X, y): | |
param_grid = { | |
# Fixed Params | |
"eval_metric": "MAE", | |
"n_estimators": 10000, | |
"random_state": 42, | |
# Tuned Params | |
"learning_rate": trial.suggest_loguniform("learning_rate", 0.05, 0.3), | |
"l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e1), | |
"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1), | |
"depth": trial.suggest_int("depth", 1, 10), | |
"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]), | |
"bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]), | |
"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20), | |
"one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20), | |
} | |
# Conditional Hyper-Parameters | |
if param_grid["bootstrap_type"] == "Bayesian": | |
param_grid["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10) | |
elif param_grid["bootstrap_type"] == "Bernoulli": | |
param_grid["subsample"] = trial.suggest_float("subsample", 0.1, 1) | |
cv = KFold(n_splits=3, shuffle=True, random_state=42) | |
cv_scores = np.empty(3) | |
for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)): | |
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] | |
y_train, y_test = y[train_idx], y[test_idx] | |
model = cb.CatBoostRegressor(**param_grid) | |
model.fit( | |
X_train, | |
y_train, | |
eval_set=[(X_test, y_test)], | |
cat_features=cat_vars, | |
early_stopping_rounds=20, | |
verbose=False | |
) | |
preds = model.predict(X_test) | |
cv_scores[idx] = mean_absolute_error(y_test, preds) | |
return np.mean(cv_scores) | |
# Similar to Hyperopt - this code creates an experiment using the objective function and parameter grid above | |
study = optuna.create_study(direction="minimize", study_name="CatBoost Regressor") | |
func = lambda trial: objective(trial, X_prepared, y_train.to_numpy()) | |
study.optimize(func, n_trials=100) | |
print(f"\tBest value (mae): {study.best_value:.5f}") | |
print(f"\tBest params:") | |
for key, value in study.best_params.items(): | |
print(f"\t\t{key}: {value}") | |
"""### Best Params Model | |
Based on the 400 ``Optuna`` trials, the following hyperparameter values achieved the best loss (MAE: 0.05265):
``` | |
best_params = { | |
'learning_rate': 0.01, | |
'l2_leaf_reg': 3, | |
'depth': 6, | |
'n_estimators': 900, | |
'random_state': 42, | |
'eval_metric': 'MAE', | |
'loss_function': 'MAE' | |
} | |
``` | |
""" | |
best_params = { | |
'learning_rate': 0.03, | |
'l2_leaf_reg': 3, | |
'depth': 6, | |
'n_estimators': 900, | |
'random_state': 42, | |
'eval_metric': "MAE", | |
'loss_function': 'MAE', | |
} | |
fit_params={ | |
'cat_features': cat_vars, | |
'verbose': False, | |
} | |
cb_base = cb.CatBoostRegressor(**best_params) | |
cb_base.fit(X_prepared, y_train, **fit_params) | |
get_eval_metrics([cb_base], X_prepared_val, y_val) | |
# get_cross_val_scores([cb_base], X_prepared, y_train, cv=3, fit_params=fit_params) | |
cb_importances = cb_base.get_feature_importance() | |
cb_importances = pd.DataFrame(sorted(zip(cb_importances, X_cb.columns)), columns=['Value','Feature']) | |
# Select the 20 most important features
cb_importances = cb_importances.sort_values(by='Value', ascending=False)[:20] | |
# Plot importances | |
plt.figure(figsize=(10, 10)) | |
sns.barplot(x="Value", y="Feature", data=cb_importances.sort_values(by="Value", ascending=False)) | |
plt.title('CatBoost Global Feature Importances') | |
plt.tight_layout() | |
plt.show() | |
"""Analyzing the plot above shows how the location related features (``propertycountylandusecode``, ``regionidzip``) are some of the most important followed by the age of house and ``calculatedfinishedsquarefeet``. """ | |
!pip install ml_insights | |
# !pip install ml_insights #if you don't have it installed | |
import ml_insights as mli | |
range_pts = mli.get_range_dict(X_cb) | |
test_pts = X_cb_val.sample(5, random_state=40) | |
imp_num_feat = [ | |
'calculatedfinishedsquarefeet', 'house_age', | |
'taxamount', 'structuretaxvaluedollarcnt', 'latitude', | |
] | |
"""The ranges of values used for x-axis need to be updated to remove outliers to better interpret the ICE-plot.""" | |
# Updating the ranges of features on x-axis for better interpretation | |
range_pts['calculatedfinishedsquarefeet'] = np.linspace(0, 6000, 200) | |
range_pts['house_age'] = np.linspace(0, 130, 65) | |
range_pts['taxamount'] = np.linspace(0, 25000, 200) | |
range_pts['structuretaxvaluedollarcnt'] = np.linspace(0, 300000, 200) | |
range_pts['latitude'] = np.linspace(3.35e+07, 3.46e+07, 200) | |
mli.ice_plot(cb_base, test_pts, imp_num_feat, range_pts=range_pts, pred_fn='predict', figsize=(15,10)) | |
!pip install shap | |
# !pip install shap #if you don't have it installed | |
import shap | |
shap.initjs() | |
explainer = shap.Explainer(cb_base) | |
shap_values = explainer(X_cb) | |
# The SHAP expected (base) value is approximately the model's mean prediction; compare it against the mean of the target
explainer.expected_value, np.mean(y_train) | |
!pip install ipython | |
from IPython.core.display import display, HTML | |
shap.initjs() | |
y_train1 = y_train.reset_index(drop=True) | |
# With the index reset, percentile-based positions can be computed directly
i_med = np.argsort(y_train1)[len(y_train1)//2] | |
i_71 = np.argsort(y_train1)[int(len(y_train1) * 0.75)] | |
i_20 = np.argsort(y_train1)[int(len(y_train1) * 0.20)] | |
shap.plots.waterfall(shap_values[i_71], max_display=15) | |
#shap_html = shap.force_plot(explainer.expected_value, shap_values[i_med].values, matplotlib=True) | |
#shap_html.savefig(course_path + 'shap_force_plot.png') | |
shap.force_plot(explainer.expected_value, shap_values[i_med].values, matplotlib=True) | |
plt.savefig(course_path +'shap_force_plot.png') | |
"""### Force Plots | |
Force plots are representations equivalent to waterfall plots, displaying the key information in a more condensed format.
""" | |
# Force plot for 20th percentile y_train value | |
shap.force_plot(explainer.expected_value, shap_values[i_20].values, matplotlib=True) | |
plt.savefig(course_path +'shap_force_plot20.png') | |
"""## SHAP for Global Interpretability | |
Thanks to their versatility, SHAP values can also be used to get a sense of how "globally" important a feature is by aggregating the absolute values of the local feature attributions. These numbers can be thought of as the "average absolute impact" that a variable has on the prediction.
""" | |
shap.plots.bar(shap_values, max_display=20) | |
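"""A complementary view (a brief sketch reusing the ``shap_values`` computed above): the beeswarm plot shows
each row's SHAP value per feature, so the direction of the effect (positive or negative) is visible in
addition to the overall magnitude."""
shap.plots.beeswarm(shap_values, max_display=20)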
def fit_rf(X_prepared, y_train, random_state=42): | |
best_params = { | |
'n_estimators': 400, | |
'max_samples': 0.3, | |
'max_depth': 70, | |
'max_features': 0.6, | |
'min_samples_leaf': 32, | |
'random_state': random_state, | |
} | |
ran_forest = RandomForestRegressor(**best_params) | |
ran_forest.fit(X_prepared, y_train) | |
return ran_forest | |
def fit_xgb(X_prepared, y_train, random_state=42): | |
fit_params={ | |
'early_stopping_rounds': 15, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_xgb_val, y_val]], | |
} | |
best_params = { | |
'colsample_bynode': 0.75, | |
'gamma': 0.201, | |
'max_depth': 5, | |
'subsample': 0.95, | |
'tree_method': 'exact', | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': random_state, | |
} | |
xgb_base = xgb.XGBRegressor(**best_params) | |
xgb_base.fit(X_prepared, y_train, **fit_params) | |
return xgb_base | |
def fit_lgbm(X_prepared, y_train, random_state=42): | |
fit_params={ | |
'early_stopping_rounds': 30, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[X_lgbm_val, y_val]] | |
} | |
best_params = { | |
'num_leaves': 512, | |
'max_depth': 9, | |
'lambda_l2': 15, | |
'bagging_fraction': 0.9, | |
'feature_fraction': 0.6, | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': random_state, | |
} | |
lgbm_base = lgbm.LGBMRegressor(**best_params) | |
lgbm_base.fit(X_prepared, y_train, **fit_params) | |
return lgbm_base | |
def fit_catboost(X_prepared, y_train, random_state=42): | |
best_params = { | |
'learning_rate': 0.03, | |
'l2_leaf_reg': 3, | |
'depth': 6, | |
'n_estimators': 900, | |
'random_state': random_state, | |
'eval_metric': "MAE", | |
'loss_function': 'MAE', | |
} | |
fit_params={ | |
'cat_features': cat_vars, | |
'verbose': False, | |
} | |
cb_base = cb.CatBoostRegressor(**best_params) | |
cb_base.fit(X_prepared, y_train, **fit_params) | |
return cb_base | |
def get_out_of_fold_preds(X_prepared, y_train, model_fcn, random_state=42): | |
""" | |
Use K-Fold Cross Validation to create out-of-fold predictions for supplied model. | |
The out-of-fold predictions can then be used to train a Meta Learner to further | |
improve the model's performance. | |
""" | |
model_preds = list() | |
meta_y = list() | |
kfold = KFold(n_splits=3, shuffle=True, random_state=42) | |
for train_ix, test_ix in kfold.split(X_prepared): | |
# Depending on whether X_prepared is a dataframe or numpy array, use appropriate indexing | |
if isinstance(X_prepared, pd.DataFrame): | |
train_X, test_X = X_prepared.iloc[train_ix], X_prepared.iloc[test_ix] | |
else: | |
train_X, test_X = X_prepared[train_ix], X_prepared[test_ix] | |
train_y, test_y = y_train.iloc[train_ix], y_train.iloc[test_ix] | |
model = model_fcn(train_X, train_y, random_state=random_state) | |
preds = model.predict(test_X) | |
meta_y.extend(test_y) | |
model_preds.extend(preds) | |
return model_preds, meta_y | |
""" | |
from sklearn.model_selection import KFold | |
def get_out_of_fold_preds(X_prepared, y_train, model_fcn, random_state=42): | |
Use K-Fold Cross Validation to create out-of-fold predictions for supplied model. | |
The out-of-fold predictions can then be used to train a Meta Learner to further | |
improve the model's performance. | |
model_preds = list() | |
meta_y = list() | |
kfold = KFold(n_splits=3, shuffle=True, random_state=42) | |
for train_ix, test_ix in kfold.split(X_prepared): | |
# Depending on whether X_prepared is a dataframe or numpy array, uncomment one line below | |
# train_X, test_X = X_prepared.iloc[train_ix], X_prepared.iloc[test_ix] | |
train_X, test_X = X_prepared[train_ix], X_prepared[test_ix] | |
train_y, test_y = y_train.iloc[train_ix], y_train.iloc[test_ix] | |
model = model_fcn(train_X, train_y, random_state=random_state) | |
preds = model.predict(test_X) | |
meta_y.extend(test_y) | |
model_preds.extend(preds) | |
return model_preds, meta_y | |
""" | |
# Get Out-of-fold predictions for each base learner | |
# WARNING: This cell takes multiple hours to run (to save time, remove the Random Forest regressor, which takes significantly longer than the others)
xgb_preds, meta_y = get_out_of_fold_preds(X_xgb, y_train, fit_xgb) | |
lgbm_preds, meta_y = get_out_of_fold_preds(X_lgbm, y_train, fit_lgbm) | |
rf_preds, meta_y = get_out_of_fold_preds(X_rf, y_train, fit_rf) | |
X_cb_reset = X_cb.reset_index(drop=True) | |
y_train_reset = y_train.reset_index(drop=True) | |
cb_preds, meta_y = get_out_of_fold_preds(X_cb_reset, y_train_reset, fit_catboost) | |
# Create meta_X training set by combining out-of-fold predictions from individual learners | |
yhat1 = array(rf_preds).reshape((len(rf_preds), 1)) | |
yhat2 = array(xgb_preds).reshape((len(xgb_preds), 1)) | |
yhat3 = array(lgbm_preds).reshape((len(lgbm_preds), 1)) | |
yhat4 = array(cb_preds).reshape((len(cb_preds), 1)) | |
meta_X = hstack((yhat1, yhat2, yhat3, yhat4)) | |
meta_X | |
# Create validation dataset for testing the Meta Learner
# Make predictions on original validation sets using originally trained base learners | |
xgb_val_preds = xgb_base.predict(X_xgb_val) | |
lgbm_val_preds = lgbm_base.predict(X_lgbm_val) | |
cb_val_preds = cb_base.predict(X_cb_val) | |
rf_val_preds = ran_forest.predict(X_rf_val) | |
# Create validation dataset | |
yhat1 = array(rf_val_preds).reshape((len(rf_val_preds), 1)) | |
yhat2 = array(xgb_val_preds).reshape((len(xgb_val_preds), 1)) | |
yhat3 = array(lgbm_val_preds).reshape((len(lgbm_val_preds), 1)) | |
yhat4 = array(cb_val_preds).reshape((len(cb_val_preds), 1)) | |
meta_X_val = hstack((yhat1, yhat2, yhat3, yhat4)) | |
meta_X_val | |
meta_X.shape, meta_X_val.shape | |
"""## Meta Learners | |
Train a Meta Learner on the training set created from the out-of-fold predictions of the individual learners.
""" | |
# Initialize LightGBM Model | |
best_params = { | |
'num_leaves': 128, | |
'max_depth': 6, | |
'lambda_l2': 15, | |
'bagging_fraction': 0.6, | |
'learning_rate': 0.01, | |
'n_estimators': 10000, | |
'random_state': 42, | |
} | |
# Fit model using early stopping and validation set | |
fit_params={'early_stopping_rounds': 30, | |
'eval_metric': 'mae', | |
'verbose': False, | |
'eval_set': [[meta_X_val, y_val]] | |
} | |
lgbm_stacked = lgbm.LGBMRegressor(**best_params) | |
lgbm_stacked.fit(meta_X, meta_y, **fit_params) | |
# CatBoost Meta Learner | |
best_params = { | |
'learning_rate': 0.01, | |
'l2_leaf_reg': 15, | |
'depth': 6, | |
'min_data_in_leaf': 32, | |
'subsample': 0.6, | |
'n_estimators': 10000, | |
'random_state': 42, | |
'eval_metric': "MAE", | |
} | |
fit_params={ | |
'eval_set': (meta_X_val, y_val), | |
'early_stopping_rounds': 30, | |
'verbose': False, | |
} | |
cb_stacked = cb.CatBoostRegressor(**best_params) | |
cb_stacked.fit(meta_X, meta_y, **fit_params) | |
# Meta Model MAE on Validation Set | |
y_preds = lgbm_stacked.predict(meta_X_val) | |
# y_preds = cb_stacked.predict(meta_X_val) | |
mae = mean_absolute_error(y_val, y_preds) | |
rmse = mean_squared_error(y_val, y_preds, squared=False) | |
print(f"Stacked Model Evaluation") | |
print(f"MAE: {mae}, RMSE: {rmse}") | |
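# For context (a sketch reusing the validation predictions computed earlier): MAE of each base learner
# on the same validation set, to compare against the stacked model above.
for name, preds_ in [("RF", rf_val_preds), ("XGB", xgb_val_preds),
                     ("LGBM", lgbm_val_preds), ("CatBoost", cb_val_preds)]:
    print(f"{name} MAE: {mean_absolute_error(y_val, preds_):.5f}")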
def create_model_preds(all_models, X):
    """
    Generate predictions on the supplied data from each fitted model.
    all_models: List[Tuple(model, preprocessor)]
        List of tuples where each tuple holds a fitted model and its data preprocessor
    X:
        Data to predict on
    """
combined_preds = [] | |
for model_info in all_models: | |
model, preprocessor = model_info | |
print(f"Starting preds for {model}") | |
# Note: function below is defined in later section (Submission CSV Generation) | |
model_pred = predict_in_chunks(model, preprocessor, X) | |
combined_preds.append(model_pred) | |
return combined_preds | |
def combine_model_preds(combined_preds, weights, X):
    """Combine per-model predictions into a single prediction using the supplied weights
    (weights must be the same length as combined_preds)."""
    # Start from zeros so the weighted contributions accumulate correctly (np.empty is uninitialized)
    final_pred = np.zeros(len(X))
print(f"Combining with weights: {weights}") | |
for idx, weight in enumerate(weights): | |
final_pred += combined_preds[idx] * weight | |
return final_pred | |
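# Illustrative check (a self-contained sketch, not from the original notebook): combining three dummy
# prediction vectors with weights 0.5 / 0.3 / 0.2 to confirm the weighted blend behaves as expected.
demo_preds = [np.full(4, 0.1), np.full(4, 0.2), np.full(4, 0.4)]
demo_final = combine_model_preds(demo_preds, [0.5, 0.3, 0.2], np.arange(4))
print(demo_final)  # each entry = 0.5*0.1 + 0.3*0.2 + 0.2*0.4 = 0.19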
def chunker(seq, size): | |
"""Function to iterate over dataframe in chunks.""" | |
return (seq[pos:pos + size] for pos in range(0, len(seq), size)) | |
def predict_in_chunks(model, preprocessor, df_test): | |
final_pred = [] | |
for group in chunker(df_test, 300000): | |
group = preprocessor.transform(group) | |
pred = model.predict(group) | |
final_pred.append(pred) | |
del group; gc.collect() | |
del pred; gc.collect() | |
return np.concatenate(final_pred).ravel() | |
print('Loading Properties ...') | |
#properties_2016 = pd.read_csv('../input/zillow-prize-1/properties_2016.csv', low_memory = False) | |
properties_2016 = pd.read_csv(course_path+'properties_2016.csv', low_memory = False) | |
print('Loading Sample ...') | |
#sample_submission = pd.read_csv('../input/zillow-prize-1/sample_submission.csv', low_memory = False) | |
sample_submission = pd.read_csv(course_path+ 'sample_submission.csv', low_memory = False) | |
sample_submission['parcelid'] = sample_submission['ParcelId'] | |
print('Merging sample submission with 2016 properties ...')
df_test = pd.merge(sample_submission[['parcelid']], properties_2016, how = 'left', on = 'parcelid') | |
df_test['transactiondate'] = pd.Timestamp('2016-12-01') | |
df_test.shape | |
# Models to generate predictions for | |
all_models = [ | |
(ran_forest, rf_preprocessor), | |
(xgb_base, xgb_preprocessor), | |
(lgbm_base, lgbm_preprocessor), | |
(cb_base, cb_preprocessor), | |
] | |
# Creating Predictions for each model | |
combined_preds = create_model_preds(all_models, df_test) # df_test was created above from the sample submission and 2016 properties
# Create Meta_X from base learner predictions | |
yhat1 = combined_preds[0].reshape((len(df_test), 1)) | |
yhat2 = combined_preds[1].reshape((len(df_test), 1)) | |
yhat3 = combined_preds[2].reshape((len(df_test), 1)) | |
yhat4 = combined_preds[3].reshape((len(df_test), 1)) | |
meta_X_submission = hstack((yhat1, yhat2, yhat3, yhat4)) | |
meta_X_submission.shape | |
# Create predictions from Meta Learners using outputs from individual learners | |
lgbm_stacked_pred = lgbm_stacked.predict(meta_X_submission) | |
cb_stacked_pred = cb_stacked.predict(meta_X_submission) | |
meta_combined_preds = [lgbm_stacked_pred, cb_stacked_pred] | |
"""#### **The final model yielded in a Public Score of 0.06416 and a private score of 0.07497 which are within Top 250 and Top 60 respectively in the world out of over 3700 submissions.** """ | |
# Models = [RF, XGB, LGBM, CB] | |
weights = [0.0, 0.0, 0.20, 0.55] | |
final_pred = combine_model_preds(combined_preds, weights, df_test) | |
# Models: [Stacked_LGBM, Stacked_CB] | |
weights = [0.25, 0.00] | |
print(f"Combining with weights: {weights}") | |
for idx, weight in enumerate(weights): | |
final_pred += meta_combined_preds[idx] * weight | |
# Create final submission CSV using final test predictions | |
#sub = pd.read_csv('../input/zillow-prize-1/sample_submission.csv') | |
sub = pd.read_csv(course_path+'sample_submission.csv') | |
for c in sub.columns[sub.columns != 'ParcelId']: | |
sub[c] = final_pred | |
print('Writing csv ...') | |
#sub.to_csv('_9_Combined_plus_Stacked.csv', index=False, float_format='%.4f') | |
sub.to_csv(course_path+'_9_Combined_plus_Stacked.csv', index=False, float_format='%.4f') | |
"""# Submission CSV Generation | |
The following code runs a chosen model along with the Scikit-Learn Pipeline preprocessor to generate the final submission CSV using the Kaggle Test data. | |
""" | |
print('Loading Properties ...') | |
#properties_2016 = pd.read_csv('../input/zillow-prize-1/properties_2016.csv', low_memory = False) | |
properties_2016 = pd.read_csv(course_path+'properties_2016.csv', low_memory = False) | |
print('Loading Sample ...') | |
#sample_submission = pd.read_csv('../input/zillow-prize-1/sample_submission.csv', low_memory = False) | |
sample_submission = pd.read_csv(course_path+ 'sample_submission.csv', low_memory = False) | |
sample_submission['parcelid'] = sample_submission['ParcelId'] | |
print('Merging sample submission with 2016 properties ...')
df_test = pd.merge(sample_submission[['parcelid']], properties_2016, how = 'left', on = 'parcelid') | |
df_test['transactiondate'] = pd.Timestamp('2016-12-01') | |
df_test.shape | |
del properties_2016, sample_submission; gc.collect() | |
def chunker(seq, size): | |
"""Function to iterate over dataframe in chunks.""" | |
return (seq[pos:pos + size] for pos in range(0, len(seq), size)) | |
def predict_in_chunks(model, preprocessor, df_test): | |
final_pred = [] | |
for group in chunker(df_test, 300000): | |
group = preprocessor.transform(group) | |
pred = model.predict(group) | |
final_pred.append(pred) | |
del group; gc.collect() | |
del pred; gc.collect() | |
return np.concatenate(final_pred).ravel() | |
# Run the chosen model to generate predictions | |
model = cb_base | |
final_pred = predict_in_chunks(model, cb_preprocessor, df_test) | |
len(final_pred) | |
# Create final submission CSV using final test predictions | |
#sub = pd.read_csv('../input/zillow-prize-1/sample_submission.csv') | |
sub = pd.read_csv(course_path+ 'sample_submission.csv') | |
for c in sub.columns[sub.columns != 'ParcelId']: | |
sub[c] = final_pred | |
print('Writing csv ...') | |
#sub.to_csv('2_CatBoost_smaller_feat_set_best_params.csv', index=False, float_format='%.4f') | |
sub.to_csv(course_path+'2_CatBoost_smaller_feat_set_best_params.csv', index=False, float_format='%.4f') |