# Will Koehrsen (WillKoehrsen)

## query_vars.py
# Examines the effect of changing a single variable
# Takes in the name of the variable, the trace, and the data
def model_effect(query_var, trace, X):
    """Examine the effect of changing a single variable.

    Args:
        query_var: Name of the variable under study. It must be present in
            ``X.columns``; ``list.remove`` raises ``ValueError`` otherwise.
        trace: The trace (not used by the visible part of this snippet).
        X: The data; only its ``columns`` attribute is read here.

    NOTE(review): this snippet stops after the setup below. The linear model
    announced by the trailing comment is not part of the visible code, so the
    function as shown returns ``None``.
    """

    # Variables that do not change (a fresh list, so X.columns is not mutated)
    steady_vars = list(X.columns)
    steady_vars.remove(query_var)

    # Linear Model that estimates a grade based on the value of the query variable
    # and one sample from the trace

## features_dist.py
# X_train is our training data; work on a copy so plotting does not modify it
X_plot = X_train.copy()
# Flag every row by whether its grade is at or above 12 (the value this
# snippet uses as the median)
X_plot['relation_median'] = (X_plot['Grade'] >= 12)
# Label the boolean flag itself. The mapping used to be applied to the numeric
# 'Grade' column, which left 'relation_median' as raw True/False values.
X_plot['relation_median'] = X_plot['relation_median'].map(
    {True: 'above', False: 'below'})
# Plot all variables in a loop: every column except the last one, which is
# the 'relation_median' label added above, on a 3 x 2 grid of subplots
plt.figure(figsize=(12, 12))
for i, col in enumerate(X_plot.columns[:-1]):
    plt.subplot(3, 2, i + 1)

## column_conversion.py
# Swap the 'Not Available' placeholder string for a real missing value (NaN)
data = data.replace({'Not Available': np.nan})

# Substrings that mark a column name as holding numeric measurements
numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh', 'therms', 'gal',
                   'Score')

# Walk over a snapshot of the column names
for col in list(data.columns):
    # Leave columns whose name carries none of the markers untouched
    if not any(marker in col for marker in numeric_markers):
        continue
    # Cast the matching column to float
    data[col] = data[col].astype(float)

## data_conversion.py
# Turn every 'Not Available' entry into numpy's not-a-number
data = data.replace({'Not Available': np.nan})

# Go through a snapshot of the column names
for col in list(data.columns):
    # A column is treated as numeric when its name mentions a unit or a score
    if any(unit in col for unit in ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh',
                                    'therms', 'gal', 'Score')):
        # Store the column back as float
        data[col] = data[col].astype(float)

## density_plot_building_type.py
# Building types with more than 100 measurements: drop rows without a score,
# count the rows per 'Largest Property Use Type', keep the counts above 100
types = list(
    data.dropna(subset=['score'])['Largest Property Use Type']
    .value_counts()
    .loc[lambda counts: counts.values > 100]
    .index
)

# Figure size for the score distributions of the building categories
figsize(12, 10)

# Plot each building
for b_type in types:

## feature_engineering.py
# Work on a separate copy so the original data is left untouched
features = data.copy(deep=True)

# Keep only the numeric columns of the original data
numeric_subset = data.select_dtypes(include='number')

# Create columns with log of numeric columns
for col in numeric_subset.columns:
    # Skip the Energy Star Score column
    if col == 'score':

## baseline.py
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Both arguments are converted to float arrays first, so plain Python
    lists, tuples and scalars are accepted in addition to NumPy arrays.
    Values are compared by position: an index carried by a pandas object is
    dropped by the conversion. A scalar on either side is broadcast against
    the other argument.

    Args:
        y_true: Observed values (array-like or scalar).
        y_pred: Predicted values (array-like or scalar).

    Returns:
        The mean of ``|y_true - y_pred|`` as a NumPy float.
    """
    # Converting to float also avoids wrap-around when subtracting unsigned ints
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

# The baseline is a single constant prediction: the median of y
baseline_guess = np.median(y)

# Report the guess, then its mean absolute error against the test targets
print('The baseline guess is a score of %0.2f' % (baseline_guess,))
print("Baseline Performance on the test set: MAE = %0.4f" % (mae(y_test, baseline_guess),))

## feature_pairs_plot.py
# Pull out the columns to plot and turn +/- infinity into missing values (NaN)
plot_data = features[['score', 'Site EUI (kBtu/ft²)',
                      'Weather Normalized Source EUI (kBtu/ft²)',
                      'log_Total GHG Emissions (Metric Tons CO2e)']
                     ].replace([np.inf, -np.inf], np.nan)

# Rename columns
plot_data = plot_data.rename(columns = {'Site EUI (kBtu/ft²)': 'Site EUI',

## gradient_boosted_model.py
from sklearn.ensemble import GradientBoostingRegressor

# Build the regressor with its default settings and fit it on the training
# data in one step (fit() hands back the fitted estimator)
gradient_boosted = GradientBoostingRegressor().fit(X, y)

# Predict for the test data
predictions = gradient_boosted.predict(X_test)

## gradient_boosted_hyperparameter_tuning.py
# Candidate values, one list per hyperparameter.
# NOTE(review): presumably fed to a hyperparameter search — the search itself
# is not part of this snippet.

# Loss function to be optimized
# NOTE(review): 'ls' and 'lad' are the older scikit-learn spellings; newer
# releases name these losses 'squared_error' and 'absolute_error' — confirm
# against the installed scikit-learn version.
loss = ['ls', 'lad', 'huber']

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]

# Minimum number of samples per leaf
	# Examines the effect of changing a single variable
	# Takes in the name of the variable, the trace, and the data
	def model_effect(query_var, trace, X):
		"""Examine the effect of changing a single variable.

		Args:
			query_var: Name of the variable under study. It must be present
				in ``X.columns``; ``list.remove`` raises ``ValueError``
				otherwise.
			trace: The trace (not used by the visible part of this snippet).
			X: The data; only its ``columns`` attribute is read here.

		NOTE(review): this snippet stops after the setup below. The linear
		model announced by the trailing comment is not part of the visible
		code, so the function as shown returns ``None``.
		"""
		# The body is nested under the def again; with every line at the same
		# indent as the def the snippet could not be parsed.

		# Variables that do not change (a fresh list, so X.columns is not mutated)
		steady_vars = list(X.columns)
		steady_vars.remove(query_var)

		# Linear Model that estimates a grade based on the value of the query variable
		# and one sample from the trace
	# X_train is our training data; work on a copy so plotting does not modify it
	X_plot = X_train.copy()
	# Flag every row by whether its grade is at or above 12 (the value this
	# snippet uses as the median)
	X_plot['relation_median'] = (X_plot['Grade'] >= 12)
	# Label the boolean flag itself. The mapping used to be applied to the
	# numeric 'Grade' column, which left 'relation_median' as raw True/False.
	X_plot['relation_median'] = X_plot['relation_median'].map(
		{True: 'above', False: 'below'})
	# Plot all variables in a loop: every column except the last one, which is
	# the 'relation_median' label added above, on a 3 x 2 grid of subplots
	plt.figure(figsize=(12, 12))
	for i, col in enumerate(X_plot.columns[:-1]):
		# Loop body nested again; it had been flattened to the loop's own indent
		plt.subplot(3, 2, i + 1)
	# Replace the string for missing values with not a number
	data = data.replace({'Not Available': np.nan})

	# Substrings that mark a column name as holding numeric measurements
	numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh', 'therms',
		'gal', 'Score')

	# Iterate through a snapshot of the columns. The loop and if bodies are
	# nested again; flattened to one indent level they could not be parsed.
	for col in list(data.columns):
		# Select columns that should be numeric using string matching
		if any(marker in col for marker in numeric_markers):
			# Convert the data type to float
			data[col] = data[col].astype(float)
	# Replace all occurrences of Not Available with numpy not a number
	data = data.replace({'Not Available': np.nan})

	# Iterate through a snapshot of the columns. The loop and if bodies are
	# nested again; flattened to one indent level they could not be parsed.
	for col in list(data.columns):
		# Select columns that should be numeric: the name mentions a unit or a score
		if any(unit in col for unit in ('ft²', 'kBtu', 'Metric Tons CO2e',
				'kWh', 'therms', 'gal', 'Score')):
			# Convert the data type to float
			data[col] = data[col].astype(float)
	# Building types with more than 100 measurements: drop rows without a score,
	# count the rows per 'Largest Property Use Type', keep the counts above 100
	types = list(
		data.dropna(subset=['score'])['Largest Property Use Type']
		.value_counts()
		.loc[lambda counts: counts.values > 100]
		.index
	)

	# Figure size for the score distributions of the building categories
	figsize(12, 10)

	# Plot each building
	for b_type in types:
	# Work on a separate copy so the original data is left untouched
	features = data.copy(deep=True)

	# Keep only the numeric columns of the original data
	numeric_subset = data.select_dtypes(include='number')

	# Create columns with log of numeric columns
	for col in numeric_subset.columns:
	# Skip the Energy Star Score column
	if col == 'score':
	# Function to calculate mean absolute error
	def mae(y_true, y_pred):
		"""Return the mean absolute error between ``y_true`` and ``y_pred``.

		Both arguments are converted to float arrays first, so plain Python
		lists, tuples and scalars are accepted in addition to NumPy arrays.
		Values are compared by position: an index carried by a pandas object
		is dropped by the conversion. A scalar on either side is broadcast
		against the other argument.

		Args:
			y_true: Observed values (array-like or scalar).
			y_pred: Predicted values (array-like or scalar).

		Returns:
			The mean of ``|y_true - y_pred|`` as a NumPy float.
		"""
		# Body nested under the def again; the return used to sit at the def's
		# own indent, which could not be parsed.
		y_true = np.asarray(y_true, dtype=float)
		y_pred = np.asarray(y_pred, dtype=float)
		return np.mean(np.abs(y_true - y_pred))

	# The baseline is a single constant prediction: the median of y
	baseline_guess = np.median(y)

	# Report the guess, then its mean absolute error against the test targets
	print('The baseline guess is a score of %0.2f' % (baseline_guess,))
	print("Baseline Performance on the test set: MAE = %0.4f" % (mae(y_test, baseline_guess),))
	# Pull out the columns to plot and turn +/- infinity into missing values (NaN)
	plot_data = features[['score', 'Site EUI (kBtu/ft²)',
		'Weather Normalized Source EUI (kBtu/ft²)',
		'log_Total GHG Emissions (Metric Tons CO2e)']
		].replace([np.inf, -np.inf], np.nan)

	# Rename columns
	plot_data = plot_data.rename(columns = {'Site EUI (kBtu/ft²)': 'Site EUI',
	from sklearn.ensemble import GradientBoostingRegressor

	# Build the regressor with its default settings and fit it on the training
	# data in one step (fit() hands back the fitted estimator)
	gradient_boosted = GradientBoostingRegressor().fit(X, y)

	# Predict for the test data
	predictions = gradient_boosted.predict(X_test)
	# Candidate values, one list per hyperparameter.
	# NOTE(review): presumably fed to a hyperparameter search — the search
	# itself is not part of this snippet.

	# Loss function to be optimized
	# NOTE(review): 'ls' and 'lad' are the older scikit-learn spellings; newer
	# releases name these losses 'squared_error' and 'absolute_error' — confirm
	# against the installed scikit-learn version.
	loss = ['ls', 'lad', 'huber']

	# Number of trees used in the boosting process
	n_estimators = [100, 500, 900, 1100, 1500]

	# Maximum depth of each tree
	max_depth = [2, 3, 5, 10, 15]

	# Minimum number of samples per leaf