Function to plot the model evaluation with test data
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.offline as py
from sklearn.metrics import (confusion_matrix, classification_report,
                             accuracy_score, f1_score, roc_auc_score,
                             roc_curve, precision_recall_curve)


def prediction_evaluation(algorithm, X_train, X_test, y_train, y_test,
                          predictor_cols, cf='features'):
    """
    Function to fit, predict, and evaluate the provided algorithm, using the
    Plotly library to visualize the confusion matrix, model metrics, ROC curve,
    precision-recall curve, and feature importances.
    @Args:
      algorithm: the model algorithm object
      X_train: the predictor features of the training pandas data frame
      X_test: the predictor features of the testing pandas data frame
      y_train: the target variable of the training pandas data frame
      y_test: the target variable of the testing pandas data frame
      predictor_cols: the list of predictor column names, used to label
                      the feature-importance plot
      cf: toggles how the feature information is extracted from the model;
          the input only accepts 2 possible values.
          LOV - 'coefficients': specifically for logistic regression
              - 'features': specifically for tree-based models
    Return:
      predictions and probabilities
    """
    if cf not in ['features', 'coefficients']:
        # Exception case - return None
        print("ERROR: Mode toggle (cf parameter) is not in LOV. Please recheck.")
        return None, None

    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    y_prob = algorithm.predict_proba(X_test)
    algorithm_name = str(algorithm).split('(', 1)[0]
    # Extract coefficients (linear models) or feature importances (tree-based
    # models) and pair them with the predictor column names
    if cf == 'coefficients':
        coeff = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == 'features':
        coeff = pd.DataFrame(algorithm.feature_importances_)
    col_df = pd.DataFrame(predictor_cols)
    coef_smry = pd.merge(coeff, col_df, left_index=True, right_index=True, how='left')
    coef_smry.columns = ['coefficients', 'features']
    coef_smry = coef_smry.sort_values(by='coefficients', ascending=False)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Compute metrics from the confusion matrix
    tp = conf_matrix[1, 1]
    fn = conf_matrix[1, 0]
    fp = conf_matrix[0, 1]
    tn = conf_matrix[0, 0]
    accuracy_ = (tp + tn) / (tp + tn + fp + fn)
    precision_ = tp / (tp + fp)
    recall_ = tp / (tp + fn)
    f1_score_ = f1_score(y_test, y_pred)
    # Use the positive-class probabilities (not the hard labels) for ROC AUC
    model_roc_auc = roc_auc_score(y_test, y_prob[:, 1])
    # Print report
    print(algorithm)
    print("\nClassification report: \n", classification_report(y_test, y_pred))
    print("\nAccuracy Score: ", np.round(accuracy_score(y_test, y_pred), 4))
    print("F1 Score: ", np.round(f1_score_, 4))
    print("Area Under Curve: ", np.round(model_roc_auc, 4), "\n")
    # Trace 1: plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=['Not Leave', 'Leave'],
                        y=['Not Leave', 'Leave'],
                        showscale=False,
                        colorscale='Picnic',
                        name="Confusion Matrix"
                        )
    # Trace 2: plot model metrics
    show_metrics = pd.DataFrame(data=[[accuracy_, precision_, recall_, f1_score_]])
    show_metrics = show_metrics.T
    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']
    trace2 = go.Bar(x=show_metrics[0].values,
                    y=['Accuracy', 'Precision', 'Recall', 'F1 score'],
                    text=np.round(show_metrics[0].values, 4),
                    name='',
                    textposition='auto',
                    orientation='h',
                    opacity=0.8,
                    marker=dict(color=colors,
                                line=dict(color='#000000',
                                          width=1.5)
                                )
                    )
    # Traces 3 & 4: plot ROC curve and the chance (diagonal) reference line
    fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
    trace3 = go.Scatter(x=fpr,
                        y=tpr,
                        name="ROC: " + str(np.round(model_roc_auc, 4)),
                        line=dict(color='rgb(22, 96, 197)',
                                  width=2)
                        )
    trace4 = go.Scatter(x=[0, 1],
                        y=[0, 1],
                        line=dict(color='rgb(205, 12, 24)',
                                  width=1.5,
                                  dash='dot')
                        )
    # Trace 5: plot precision-recall curve
    __precision, __recall, t = precision_recall_curve(y_test, y_prob[:, 1])
    trace5 = go.Scatter(x=__recall,
                        y=__precision,
                        name="Precision-Recall",
                        line=dict(color='lightcoral',
                                  width=2),
                        fill='tozeroy'
                        )
    # Trace 6: plot coefficients / feature importances
    trace6 = go.Bar(x=coef_smry['features'],
                    y=coef_smry['coefficients'],
                    name="coefficients",
                    marker=dict(color=coef_smry['coefficients'],
                                colorscale='Picnic',
                                line=dict(width=.6, color='black')
                                )
                    )
    # Assemble the subplots: a 2x2 grid plus a full-width bottom row
    fig = tls.make_subplots(rows=3, cols=2,
                            specs=[[{}, {}],
                                   [{}, {}],
                                   [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Metrics',
                                            'Receiver Operating Characteristic (ROC)',
                                            'Precision - Recall Curve',
                                            'Feature Importances'
                                            )
                            )
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 2, 1)
    fig.append_trace(trace4, 2, 1)
    fig.append_trace(trace5, 2, 2)
    fig.append_trace(trace6, 3, 1)
    fig['layout'].update(showlegend=False,
                         title="Model Performance of {}".format(algorithm_name),
                         autosize=False,
                         height=1000,
                         width=800,
                         plot_bgcolor='rgba(240, 240, 240, 0.95)',
                         paper_bgcolor='rgba(240, 240, 240, 0.95)',
                         margin=dict(b=195)
                         )
    fig['layout']['xaxis1'].update(dict(title="Prediction"))
    fig['layout']['yaxis1'].update(dict(title="Actual"))
    fig['layout']['xaxis2'].update(dict(range=[0, 1]))
    fig['layout']['xaxis3'].update(dict(title="False Positive Rate"))
    fig['layout']['yaxis3'].update(dict(title="True Positive Rate"))
    fig['layout']['xaxis4'].update(dict(title="Recall", range=[0, 1.05]))
    fig['layout']['yaxis4'].update(dict(title="Precision", range=[0, 1.05]))
    fig['layout']['xaxis5'].update(dict(showgrid=True,
                                        tickfont=dict(size=10),
                                        tickangle=90
                                        )
                                   )
    fig.layout.titlefont.size = 14
    py.iplot(fig)
    return y_pred, y_prob
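
A minimal usage sketch, for illustration only: the data frame df, its target column 'target', and the 80/20 split are assumptions, not part of the original gist. Any scikit-learn classifier exposing predict_proba and feature_importances_ (or coef_ with cf='coefficients') should work the same way.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hypothetical data: df is a pandas DataFrame with a binary 'target' column
predictor_cols = [c for c in df.columns if c != 'target']
X_train, X_test, y_train, y_test = train_test_split(
    df[predictor_cols], df['target'], test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred, y_prob = prediction_evaluation(rf, X_train, X_test, y_train, y_test,
                                       predictor_cols, cf='features')

For a logistic regression model, the same call with cf='coefficients' would plot the model's coefficients in the bottom panel instead of feature importances.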