Skip to content

Instantly share code, notes, and snippets.

# Horizontal bar chart of ten feature importances.
# head(10) keeps the first ten rows of df_all_features as given, so these are
# the ten most important features only if that frame is already sorted by
# descending importance.
df_top_features = df_all_features.head(10).sort_values(by='importance', ascending=True)
plt.figure(figsize=(10, 6))
plt.barh(df_top_features['feature'], df_top_features['importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances')
# Write each value next to its bar: `index` is the bar's position along the
# y-axis (bars are drawn in row order) and `value` is where the bar ends.
for index, value in enumerate(df_top_features['importance']):
    plt.text(value, index, f'{value:.4f}', va='center')
# Pair every training column name with the importance the fitted forest
# assigned to it, then rank the rows from most to least important.
arr_feature_names = X_train.columns.values
arr_feature_importances = rfc.feature_importances_
df_feature_importance = pd.DataFrame(
    {'feature': arr_feature_names, 'importance': arr_feature_importances}
)
df_all_features = df_feature_importance.sort_values(by='importance', ascending=False)
# Bare expression: displays the ranked table when run as a notebook cell.
df_all_features
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a default random forest on the training split,
# scored by accuracy. The fold assignment is shuffled with a fixed seed.
# NOTE(review): the model itself is created without random_state, so the fold
# scores may still vary from run to run - confirm whether that is intended.
model = RandomForestClassifier()
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, Y_train, cv=k_fold, scoring='accuracy')
# Walk the per-fold scores with a 1-based fold number.
# NOTE(review): the body of this loop is not present in this excerpt.
for i, score in enumerate(scores, 1):
# Kolmogorov-Smirnov statistic: the largest gap between the cumulative "good"
# share and the cumulative "bad" share. Both curves are then drawn against the
# predicted probability (red = bad, blue = good).
cum_perc_good = df_actual_predicted['Cumulative Perc Good']
cum_perc_bad = df_actual_predicted['Cumulative Perc Bad']
pred_proba = df_actual_predicted['y_pred_proba']
KS = max(cum_perc_good - cum_perc_bad)
plt.plot(pred_proba, cum_perc_bad, color='r')
plt.plot(pred_proba, cum_perc_good, color='b')
plt.xlabel('Estimated Probability for Being Bad')
plt.ylabel('Cumulative %')
plt.title('Kolmogorov-Smirnov: %0.4f' % KS)
# Order the rows by predicted probability and renumber them from 0; the old
# index is kept as a column by reset_index().
df_actual_predicted = df_actual_predicted.sort_values('y_pred_proba').reset_index()

# Totals used as denominators below; adding columns does not change them.
n_rows = df_actual_predicted.shape[0]
n_actual_total = df_actual_predicted['y_actual'].sum()

# Running counts down the sorted frame.
# NOTE(review): 'Cumulative N Bad' is the running sum of y_actual, so it
# counts whatever y_actual == 1 stands for - confirm that matches the label
# encoding used for this frame.
df_actual_predicted['Cumulative N Population'] = df_actual_predicted.index + 1
df_actual_predicted['Cumulative N Bad'] = df_actual_predicted['y_actual'].cumsum()
df_actual_predicted['Cumulative N Good'] = (
    df_actual_predicted['Cumulative N Population'] - df_actual_predicted['Cumulative N Bad']
)

# The same running counts expressed as shares of their respective totals.
df_actual_predicted['Cumulative Perc Population'] = df_actual_predicted['Cumulative N Population'] / n_rows
df_actual_predicted['Cumulative Perc Bad'] = df_actual_predicted['Cumulative N Bad'] / n_actual_total
df_actual_predicted['Cumulative Perc Good'] = df_actual_predicted['Cumulative N Good'] / (n_rows - n_actual_total)
from sklearn.metrics import roc_curve, roc_auc_score
# ROC curve and AUC of the predicted probabilities against the actual labels.
y_true = df_actual_predicted['y_actual']
y_score = df_actual_predicted['y_pred_proba']
fpr, tpr, tr = roc_curve(y_true, y_score)
auc = roc_auc_score(y_true, y_score)

# Model curve, labelled with its AUC.
plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Dashed diagonal (y = x) as the reference line.
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Fit a seeded random forest on the training split.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, Y_train)

# Keep only column 1 of predict_proba for every test row.
y_pred_proba = rfc.predict_proba(X_test)[:, 1]

# Put actual labels and predicted probabilities side by side, then restore
# the test set's own index so rows can be traced back to X_test / Y_test.
actual_part = pd.DataFrame(np.array(Y_test), columns=['y_actual'])
proba_part = pd.DataFrame(y_pred_proba, columns=['y_pred_proba'])
df_actual_predicted = pd.concat([actual_part, proba_part], axis=1)
df_actual_predicted.index = Y_test.index
# Encode the string labels as integers: 'good' -> 1, 'bad' -> 0.
# Any other value maps to NaN, which makes the int cast raise.
label_to_int = {'good': 1, 'bad': 0}
Y_train = Y_train.map(label_to_int).astype(int)
Y_test = Y_test.map(label_to_int).astype(int)
# Bar chart comparing the accuracy of each model, with the value printed on
# top of its bar.
plt.figure(figsize=(10, 6))
bars = plt.bar(model_names, accuracies, color='skyblue')
for bar in bars:
    yval = bar.get_height()
    # Centre the label horizontally on the bar and anchor its bottom edge at
    # the bar's top.
    plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2f}', ha='center', va='bottom')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Models')
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
results = {}
models = {
'Random Forest': RandomForestClassifier(random_state=42),