This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Examines the effect of changing a single variable | |
# Takes in the name of the variable, the trace, and the data | |
def model_effect(query_var, trace, X): | |
# Variables that do not change | |
steady_vars = list(X.columns) | |
steady_vars.remove(query_var) | |
# Linear Model that estimates a grade based on the value of the query variable | |
# and one sample from the trace |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# X_train is our training data; work on a copy so the plotting columns
# added below never end up in the training frame
X_plot = X_train.copy()

# Compare grades to the median: boolean flag, True when Grade >= 12
X_plot['relation_median'] = (X_plot['Grade'] >= 12)
# NOTE(review): this maps True/False inside the 'Grade' column rather than
# the boolean 'relation_median' column created on the previous line --
# confirm which column was intended before relying on 'above'/'below'.
X_plot['Grade'] = X_plot['Grade'].replace({True: 'above',
                                           False: 'below'})

# Plot all variables in a loop: one panel of a 3 x 2 grid per column,
# skipping the last column of the frame
plt.figure(figsize=(12, 12))
for i, col in enumerate(X_plot.columns[:-1]):
    # subplot indices are 1-based, enumerate is 0-based
    plt.subplot(3, 2, i + 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace the string for missing values with not a number
data = data.replace({'Not Available': np.nan})

# Substrings whose presence in a column name marks the column as numeric
numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh',
                   'therms', 'gal', 'Score')

# Iterate through the columns
for col in data.columns:
    # Select columns that should be numeric using string matching
    if any(marker in col for marker in numeric_markers):
        # Convert the data type to float
        data[col] = data[col].astype(float)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Replace all occurrences of Not Available with numpy not a number
data = data.replace({'Not Available': np.nan})

# Column-name fragments that identify a column as numeric
numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh',
                   'therms', 'gal', 'Score')

# Iterate through the columns
for col in data.columns:
    # Select columns that should be numeric
    if any(marker in col for marker in numeric_markers):
        # Convert the data type to float
        data[col] = data[col].astype(float)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a list of buildings with more than 100 measurements:
# keep only rows that have a score ...
scored = data.dropna(subset=['score'])
# ... count the rows per 'Largest Property Use Type' ...
use_type_counts = scored['Largest Property Use Type'].value_counts()
# ... and keep the type names whose count exceeds 100
types = list(use_type_counts[use_type_counts.values > 100].index)

# Plot of distribution of scores for building categories
figsize(12, 10)
# Plot each building | |
for b_type in types: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copy the original data so new feature columns do not modify `data`
features = data.copy()

# Select the numeric columns
numeric_subset = data.select_dtypes('number')
# Create columns with log of numeric columns | |
for col in numeric_subset.columns: | |
# Skip the Energy Star Score column | |
if col == 'score': |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred`.

    Both arguments are converted with ``np.asarray`` first, so plain
    sequences and scalars are accepted and values are compared by position
    (pandas objects are not re-aligned on their index labels). Normal NumPy
    broadcasting applies, e.g. a scalar `y_pred` is compared against every
    element of `y_true`.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.abs(errors))
# Naive constant prediction: the median of the targets in `y`
# (presumably the training targets -- verify against where `y` is defined)
baseline_guess = np.median(y)

print('The baseline guess is a score of %0.2f' % baseline_guess)
# Score the constant guess against the test targets
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the columns to plot
plot_data = features[['score', 'Site EUI (kBtu/ft²)',
                      'Weather Normalized Source EUI (kBtu/ft²)',
                      'log_Total GHG Emissions (Metric Tons CO2e)']]

# Replace the inf with nan (both +inf and -inf)
plot_data = plot_data.replace({np.inf: np.nan, -np.inf: np.nan})
# Rename columns | |
plot_data = plot_data.rename(columns = {'Site EUI (kBtu/ft²)': 'Site EUI', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import GradientBoostingRegressor

# Create the model with the library's default hyperparameters
gradient_boosted = GradientBoostingRegressor()

# Fit the model on the training data
gradient_boosted.fit(X, y)

# Make predictions on the test data
predictions = gradient_boosted.predict(X_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Loss function to be optimized
# NOTE(review): newer scikit-learn releases spell 'ls' and 'lad' as
# 'squared_error' and 'absolute_error' -- confirm these names against the
# installed version before running a search with them.
loss = ['ls', 'lad', 'huber']

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]
# Minimum number of samples per leaf |