Will Koehrsen (WillKoehrsen) - GitHub gists
import numpy as np

# Replace all occurrences of Not Available with numpy not a number
data = data.replace({'Not Available': np.nan})

# Iterate through the columns
for col in list(data.columns):
    # Select columns that should be numeric
    if ('ft²' in col or 'kBtu' in col or 'Metric Tons CO2e' in col or 'kWh' in col
            or 'therms' in col or 'gal' in col or 'Score' in col):
        # Convert the data type to float
        data[col] = data[col].astype(float)
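A quick follow-up check, not part of the original snippet, to confirm the conversion and see how much data is missing afterwards:

# Sketch only: verify which columns are now numeric and inspect missing values
print(data.select_dtypes('number').dtypes)
print(data.isnull().sum().sort_values(ascending=False).head(10))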
# Create a list of buildings with more than 100 measurements
types = data.dropna(subset=['score'])
types = types['Largest Property Use Type'].value_counts()
types = list(types[types.values > 100].index)
# figsize comes from IPython and is used to set the default figure size
from IPython.core.pylabtools import figsize

# Plot of distribution of scores for building categories
figsize(12, 10)

# Plot each building type
for b_type in types:
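The loop body is cut off in this preview. A minimal sketch of the per-type plot, assuming seaborn and matplotlib are used to draw one density curve per building type (the original styling may differ):

import matplotlib.pyplot as plt
import seaborn as sns

# Sketch only: density plot of Energy Star Scores for each building type
for b_type in types:
    # Select buildings of this type that have a score
    subset = data[data['Largest Property Use Type'] == b_type]
    sns.kdeplot(subset['score'].dropna(), label=b_type)

plt.xlabel('Energy Star Score')
plt.ylabel('Density')
plt.title('Density Plot of Energy Star Scores by Building Type')
plt.legend()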
# Copy the original data
features = data.copy()

# Select the numeric columns
numeric_subset = data.select_dtypes('number')

# Create columns with log of numeric columns
for col in numeric_subset.columns:
    # Skip the Energy Star Score column
    if col == 'score':
        continue
    # Log-transform the column (the 'log_' prefix matches the column names used below)
    numeric_subset['log_' + col] = np.log(numeric_subset[col])
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

baseline_guess = np.median(y)

print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
# Extract the columns to plot
plot_data = features[['score', 'Site EUI (kBtu/ft²)',
                      'Weather Normalized Source EUI (kBtu/ft²)',
                      'log_Total GHG Emissions (Metric Tons CO2e)']]

# Replace the inf with nan
plot_data = plot_data.replace({np.inf: np.nan, -np.inf: np.nan})

# Rename columns to shorter labels for plotting (the preview cuts off here;
# the remaining display names are assumed)
plot_data = plot_data.rename(columns={'Site EUI (kBtu/ft²)': 'Site EUI',
                                      'Weather Normalized Source EUI (kBtu/ft²)': 'Weather Norm EUI',
                                      'log_Total GHG Emissions (Metric Tons CO2e)': 'log GHG Emissions'})
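The renamed frame is presumably fed into a pairwise plot; a simple sketch with seaborn (the original may use a more customized PairGrid):

import seaborn as sns

# Sketch only: pairwise relationships between the selected variables
sns.pairplot(plot_data.dropna())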
from sklearn.ensemble import GradientBoostingRegressor
# Create the model
gradient_boosted = GradientBoostingRegressor()
# Fit the model on the training data
gradient_boosted.fit(X, y)
# Make predictions on the test data
predictions = gradient_boosted.predict(X_test)
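Scoring the default model with the mae function defined earlier; the exact reporting used in the original is not shown, so this is a sketch:

# Sketch only: evaluate the default gradient boosted model on the test set
print('Gradient Boosted Performance on the test set: MAE = %0.4f' % mae(y_test, predictions))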
# Loss function to be optimized
loss = ['ls', 'lad', 'huber']
# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]
# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]
# Minimum number of samples per leaf
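The preview cuts off after this comment. A hedged sketch of how the remaining candidate lists and the random search could be assembled; the candidate values and search settings below are illustrative, not the original's (the best values reported later on this page are loss='lad', max_depth=5, max_features=None, min_samples_leaf=6, min_samples_split=6, n_estimators=800):

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Sketch only: remaining candidate lists (values are assumptions)
min_samples_leaf = [1, 2, 4, 6, 8]
# Minimum number of samples to split a node
min_samples_split = [2, 4, 6, 10]
# Maximum number of features to consider when making splits
max_features = ['sqrt', 'log2', None]

# Collect the candidates into a single grid
hyperparameter_grid = {'loss': loss,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# Random search with cross validation over the grid (settings are assumptions)
random_cv = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=25,
                               scoring='neg_mean_absolute_error',
                               n_jobs=-1, verbose=1, random_state=42)
random_cv.fit(X, y)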
from sklearn.ensemble import GradientBoostingRegressor

# Create the model with the best hyperparameters
model_reduced = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                          min_samples_leaf=6, min_samples_split=6,
                                          n_estimators=800, random_state=42)
# Fit and test on the reduced set of features
model_reduced.fit(X_reduced, y)
model_reduced_pred = model_reduced.predict(X_test_reduced)
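As with the full model, the reduced-feature model can be scored with the earlier mae helper (a sketch, not shown in the preview):

# Sketch only: compare performance on the reduced feature set
print('Reduced model performance on the test set: MAE = %0.4f' % mae(y_test, model_reduced_pred))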
import pandas as pd

# model is the trained model
importances = model.feature_importances_

# train_features is the dataframe of training features
feature_list = list(train_features.columns)

# Extract the feature importances into a dataframe
feature_results = pd.DataFrame({'feature': feature_list,
                                'importance': importances})
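To see which features matter most, the dataframe can be sorted by importance; a sketch of that step (the original display may differ):

# Sketch only: show the ten most important features
feature_results = feature_results.sort_values('importance', ascending=False).reset_index(drop=True)
print(feature_results.head(10))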
from sklearn import tree

# Extract a single tree (number 105)
single_tree = model.estimators_[105][0]

# Save the tree to a dot file
tree.export_graphviz(single_tree, out_file='images/tree.dot',
                     feature_names=feature_list)
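The exported .dot file can then be rendered to an image with Graphviz; a sketch that assumes the dot executable is installed on the system:

import subprocess

# Sketch only: convert the dot file into a PNG with Graphviz
subprocess.run(['dot', '-Tpng', 'images/tree.dot', '-o', 'images/tree.png'], check=True)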