This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace all occurrences of 'Not Available' with numpy not-a-number
data = data.replace({'Not Available': np.nan})

# A column is treated as numeric when its name contains any of these
# substrings.
numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh', 'therms',
                   'gal', 'Score')

# Convert every matching column to float
for col in data.columns:
    if any(marker in col for marker in numeric_markers):
        data[col] = data[col].astype(float)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop the rows that have no score, then count how many remaining rows
# fall under each 'Largest Property Use Type'.
scored = data.dropna(subset=['score'])
type_counts = scored['Largest Property Use Type'].value_counts()

# Create a list of building types with more than 100 measurements
types = list(type_counts[type_counts > 100].index)

# Plot of distribution of scores for building categories
figsize(12, 10)
# Plot each building | |
for b_type in types: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copy the original data
features = data.copy()

# Select the numeric columns
numeric_subset = data.select_dtypes('number')
# Create columns with log of numeric columns | |
for col in numeric_subset.columns: | |
# Skip the Energy Star Score column | |
if col == 'score': |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    """Return the mean absolute error between y_true and y_pred.

    The result is the mean of the element-wise absolute difference
    ``|y_true - y_pred|``. The operands only need to support subtraction
    with each other, so y_pred may be a single scalar compared against
    every element of an array y_true.
    """
    return np.mean(np.abs(y_true - y_pred))
# Baseline prediction: the median of y, used as a single constant guess
baseline_guess = np.median(y)

print('The baseline guess is a score of %0.2f' % baseline_guess)
# Score the constant guess against the test targets
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the columns to plot
plot_data = features[['score', 'Site EUI (kBtu/ft²)',
                      'Weather Normalized Source EUI (kBtu/ft²)',
                      'log_Total GHG Emissions (Metric Tons CO2e)']]

# Replace positive and negative infinity with nan
plot_data = plot_data.replace({np.inf: np.nan, -np.inf: np.nan})
# Rename columns | |
plot_data = plot_data.rename(columns = {'Site EUI (kBtu/ft²)': 'Site EUI', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import GradientBoostingRegressor

# Create the model with its default hyperparameters
gradient_boosted = GradientBoostingRegressor()

# Fit the model on the training data
gradient_boosted.fit(X, y)

# Make predictions on the test data
predictions = gradient_boosted.predict(X_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Loss function to be optimized
# NOTE: scikit-learn 1.0 renamed 'ls' -> 'squared_error' and
# 'lad' -> 'absolute_error', and 1.2 removed the old names; update these
# values if running on a newer release.
loss = ['ls', 'lad', 'huber']

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]
# Minimum number of samples per leaf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import GradientBoostingRegressor

# Create the model with the best hyperparameters
# NOTE: scikit-learn 1.0 renamed loss='lad' to 'absolute_error' and 1.2
# removed the old name; update the value if running on a newer release.
model_reduced = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                          min_samples_leaf=6, min_samples_split=6,
                                          n_estimators=800, random_state=42)

# Fit on the reduced set of features, then predict on the reduced test set
model_reduced.fit(X_reduced, y)
model_reduced_pred = model_reduced.predict(X_test_reduced)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# model is the trained model
importances = model.feature_importances_

# train_features is the dataframe of training features
feature_list = list(train_features.columns)
# Extract the feature importances into a dataframe | |
feature_results = pd.DataFrame({'feature': feature_list, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import tree

# Extract a single tree (index 105 of the fitted estimators, first entry)
single_tree = model.estimators_[105][0]

# Save the tree to a dot file, labelling the splits with the feature names
tree.export_graphviz(single_tree, out_file = 'images/tree.dot',
                     feature_names = feature_list)