Snippets from Will Koehrsen's (WillKoehrsen) GitHub gists.
import pandas as pd
# model is the trained model
importances = model.feature_importances_
# train_features is the dataframe of training features
feature_list = list(train_features.columns)
# Extract the feature importances into a dataframe
feature_results = pd.DataFrame({'feature': feature_list,
                                'importance': importances})
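A natural follow-up (not in the truncated gist) is to sort the dataframe so the most important features appear first; a minimal sketch:
# Sort the features by importance for inspection
feature_results = feature_results.sort_values('importance', ascending = False).reset_index(drop = True)
feature_results.head(10)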
from sklearn import tree
# Extract a single tree (number 105)
single_tree = model.estimators_[105][0]
# Save the tree to a dot file
tree.export_graphviz(single_tree, out_file = 'images/tree.dot',
                     feature_names = feature_list)
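To actually view the tree, the exported dot file can be rendered to an image; one way, assuming pydot is installed:
import pydot
# Convert the exported dot file into a png
(graph, ) = pydot.graph_from_dot_file('images/tree.dot')
graph.write_png('images/tree.png')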
from sklearn.ensemble import GradientBoostingRegressor
# Create the model with the best hyperparameters
model_reduced = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                          min_samples_leaf=6, min_samples_split=6,
                                          n_estimators=800, random_state=42)
# Fit and test on the reduced set of features
model_reduced.fit(X_reduced, y)
model_reduced_pred = model_reduced.predict(X_test_reduced)
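To check that dropping features did not hurt accuracy, the reduced model can be scored on the test set; a minimal sketch, assuming y_test holds the test targets:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the reduced-feature model
print('Reduced model MAE: %0.4f' % mean_absolute_error(y_test, model_reduced_pred))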
import lime
import lime.lime_tabular
# Create a lime explainer object
explainer = lime.lime_tabular.LimeTabularExplainer(training_data = X,
                                                   mode = 'regression',
                                                   training_labels = y,
                                                   feature_names = feature_list)
# Explanation for wrong prediction
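The gist is truncated after this comment; a minimal sketch of generating and plotting the local explanation, where wrong is a hypothetical index of the test observation to explain and X_test is a numpy array of test features:
# Explain the chosen observation using the fitted model's predict function
exp = explainer.explain_instance(data_row = X_test[wrong],
                                 predict_fn = model.predict,
                                 num_features = 10)
# Plot the feature contributions for this single prediction
exp.as_pyplot_figure()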
from sklearn.ensemble import GradientBoostingRegressor
# Create the model with the best hyperparameters
model = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                  min_samples_leaf=6, min_samples_split=6,
                                  n_estimators=800, random_state=42)
# Fit and test on the features
model.fit(X, y)
model_pred = model.predict(X_test)
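A quick sanity check (not part of the gist) is to compare the model against a naive baseline; a sketch assuming y_test holds the test targets:
import numpy as np
from sklearn.metrics import mean_absolute_error
# Baseline: always predict the median of the training targets
baseline_pred = np.full(len(y_test), np.median(y))
print('Baseline MAE: %0.4f' % mean_absolute_error(y_test, baseline_pred))
print('Model MAE:    %0.4f' % mean_absolute_error(y_test, model_pred))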
from tpot import TPOTRegressor
# Create a tpot object with specified parameters
tpot = TPOTRegressor(max_time_mins = 240, n_jobs = -1,
                     scoring = 'neg_mean_absolute_error', verbosity = 2)
# Import the optimizer class
from tpot import TPOTRegressor
# Create a tpot optimizer with parameters
tpot = TPOTRegressor(scoring = 'neg_mean_absolute_error',
                     max_time_mins = 480,
                     n_jobs = -1,
                     verbosity = 2,
                     cv = 5)
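Once created, the optimizer is used like any scikit-learn estimator; a minimal sketch, assuming X, y and X_test, y_test from the earlier snippets:
# Run the optimization, score the best pipeline, and export it as a Python script
tpot.fit(X, y)
print(tpot.score(X_test, y_test))
tpot.export('tpot_exported_pipeline.py')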
# Imports needed by the exported pipeline (Imputer is the pre-0.22 scikit-learn API)
from sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from tpot.builtins import StackingEstimator
# Preprocessing steps
imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)
# Final pipeline from TPOT
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    # Remaining GradientBoostingRegressor hyperparameters are truncated in the original gist
    GradientBoostingRegressor(alpha=0.95, learning_rate=0.1, loss="lad")
)
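The exported script normally finishes by fitting the pipeline and predicting on the held-out set; a minimal sketch, assuming training_target holds the training labels:
# Fit the full pipeline on the imputed training data and predict on the test set
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)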
import pandas as pd
# Group loans by client id and calculate mean, max, min of loans
stats = loans.groupby('client_id')['loan_amount'].agg(['mean', 'max', 'min'])
stats.columns = ['mean_loan_amount', 'max_loan_amount', 'min_loan_amount']
# Merge with the clients dataframe
stats = clients.merge(stats, left_on = 'client_id', right_index=True, how = 'left')
stats.head(10)
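The same manual pattern repeats for every new feature; as a small illustration (the column name num_loans is hypothetical), counting each client's loans:
# Count the loans per client and merge the count in as another hand-built feature
counts = loans.groupby('client_id')['loan_amount'].count().to_frame('num_loans')
stats = stats.merge(counts, left_on = 'client_id', right_index=True, how = 'left')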
import featuretools as ft
# An entityset to hold the tables (assumed; created earlier in the full gist)
es = ft.EntitySet(id = 'clients')
# Create an entity from the loans dataframe
# This dataframe already has an index and a time index
es = es.entity_from_dataframe(entity_id = 'loans', dataframe = loans,
                              variable_types = {'repaid': ft.variable_types.Categorical},
                              index = 'loan_id',
                              time_index = 'loan_start')
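From here the usual next steps are to link loans back to clients and run deep feature synthesis; a minimal sketch using the same older featuretools API, assuming a 'clients' entity has also been added to the entityset:
# Relate each loan to its client, then build aggregated features automatically
r_client_loans = ft.Relationship(es['clients']['client_id'], es['loans']['client_id'])
es = es.add_relationship(r_client_loans)
feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'clients',
                                      agg_primitives = ['mean', 'max', 'min'],
                                      max_depth = 2)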