@shilpaleo
Last active June 9, 2021 03:30
Battle of the AutoML titans for a People Analytics application

Here's the link to the Medium post.

# importing libraries
import pandas as pd
import numpy as np
# AutoML library 1: PyCaret imports
import pycaret
from pycaret import classification
# AutoML library 2: TPOT imports
import tpot
from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
# importing dataset
data_df = pd.read_csv('../data/HR_comma_sep.csv') #../ goes one folder back
print(data_df.shape)
data_df.head()
# checking all columns in the dataset for missing values
data_df.isna().sum()
# initial PyCaret setup: target is 'left', data types inferred automatically
exp_clf101 = classification.setup(data = data_df, target = 'left', session_id=123)
# re-run setup, explicitly flagging the integer columns as numeric so they are not treated as categorical
exp_clf101 = classification.setup(data = data_df, target = 'left', session_id=123, numeric_features = ['number_project', 'time_spend_company', 'Work_accident', 'promotion_last_5years'])
# encode the ordinal salary column as integers: low=0, medium=1, high=2
def salary_num(val):
    if val == 'low':
        return 0
    elif val == 'medium':
        return 1
    else:
        return 2
data_df['salary'] = data_df['salary'].apply(salary_num)
data_df.head()
# compare all available classifiers and keep the best one, ranked by F1
best_model = classification.compare_models(sort='F1')
# train split confusion matrix
classification.plot_model(best_model, plot = 'confusion_matrix', use_train_data=True)
# test split confusion matrix
classification.plot_model(best_model, plot = 'confusion_matrix')
# score a copy of the full dataset with the best model (appends predicted label and score columns)
copy_data_df = data_df.copy()
classification.predict_model(best_model, data=copy_data_df)
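# (optional sketch) retrain the winning pipeline on the full dataset and persist it to disk;
# finalize_model/save_model are standard pycaret.classification functions, the file name here is illustrative
final_model = classification.finalize_model(best_model)
classification.save_model(final_model, 'best_attrition_model')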
# splitting features (X) and response (y); the target is the 'left' column used in the PyCaret setup above
# one-hot encode the remaining string column(s) (e.g. department) so TPOT/sklearn receive numeric inputs
data_encoded = pd.get_dummies(data_df)
X = data_encoded.drop(columns='left').values
y = data_encoded['left'].values
print(X.shape, y.shape)
# splitting train and test (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='f1', verbosity=2, random_state=1, n_jobs=6)
# fitting the model on train features-response
model.fit(X_train, y_train)
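# (optional sketch) score the evolved pipeline on the hold-out split and export it as a standalone
# Python script via TPOTClassifier.score / TPOTClassifier.export; the output file name is illustrative
print(model.score(X_test, y_test))
model.export('tpot_best_pipeline.py')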
# evaluation metric imports
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# plot a confusion-matrix heatmap and print headline metrics for a fitted model
def plot_metrics(x, y, model):
    y_pred = model.predict(x)
    cm = confusion_matrix(y, y_pred)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(cm), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('confusion matrix', y=1.1)
    plt.ylabel('actual')
    plt.xlabel('predicted')
    print(f'accuracy: {accuracy_score(y, y_pred)}')
    print(f'precision: {precision_score(y, y_pred)}')
    print(f'recall: {recall_score(y, y_pred)}')
    print(f'auc: {roc_auc_score(y, y_pred)}')
    print(f'f1: {f1_score(y, y_pred)}')
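# usage sketch: evaluate the fitted TPOT model on both splits with the helper above
plot_metrics(X_train, y_train, model)
plot_metrics(X_test, y_test, model)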