This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the data
df = pd.read_csv("data/HR_comma_sep.csv")

# Check both the data types and whether there are missing values.
# "\033[1m" / "\033[94m" / "\033[30m" are ANSI escape codes (bold, bright
# blue, black) used to style the headings and reset the body text colour.
print("\033[1m" + "\033[94m" + "Data types:\n" + 11 * "-")
print("\033[30m" + "{}\n".format(df.dtypes))
print("\033[1m" + "\033[94m" + "Sum of null values in each column:\n" + 35 * "-")
print("\033[30m" + "{}".format(df.isnull().sum()))

# Preview the first rows (rendered as output when run as a notebook cell)
df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ordinal encoding applied to the salary column
salary_map = {"low": 0, "medium": 1, "high": 2}

# Feature clean-up in one chain:
#   1. rename the "sales" column to "department"
#   2. replace each salary label with its integer code from salary_map
#   3. one-hot encode "department", dropping the first level
df = (
    df.rename(columns={"sales": "department"})
    .assign(salary=lambda frame: frame["salary"].map(salary_map))
    .pipe(pd.get_dummies, columns=["department"], drop_first=True)
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get number of positive (left == 1) and negative (left == 0) examples
pos = int((df["left"] == 1).sum())
neg = int((df["left"] == 0).sum())
print("Positive examples = {}".format(pos))
print("Negative examples = {}".format(neg))

# The ratio is undefined when there are no negative examples; report that
# instead of raising ZeroDivisionError.
if neg:
    print("Proportion of positive to negative examples = {:.2f}%".format((pos / neg) * 100))
else:
    print("Proportion of positive to negative examples is undefined (no negative examples)")

# Pass the column by keyword: seaborn deprecated handing a vector to
# countplot as the first positional argument.
sns.countplot(x="left", data=df)
plt.xticks((0, 1), ["Didn't leave", "Left"])
plt.xlabel("Left")
plt.ylabel("Count")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull the feature matrix (every column except "left") and the target
# vector ("left") out of the dataframe as NumPy arrays.
X = df.drop(columns="left").values
y = df["left"].values.copy()

# Hold out 20% of the rows as a test set, stratifying on the target so the
# split is made with respect to the class labels; seed fixed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
# Upsample minority class | |
X_train_u, y_train_u = resample(X_train[y_train == 1], | |
y_train[y_train == 1], | |
replace=True, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build PCA using standardized training data
pca = PCA(n_components=None, svd_solver="full")
pca.fit(StandardScaler().fit_transform(X_train))

explained_var = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(explained_var)

# Derive the x positions from the fitted PCA instead of hard-coding the
# number of components, so the plot stays correct if the feature set changes.
component_ids = range(1, len(explained_var) + 1)

plt.figure(figsize=(12, 6))
plt.bar(component_ids, explained_var, align="center",
        color='red', label="Individual explained variance")
plt.step(component_ids, cum_var_exp, where="mid", label="Cumulative explained variance")
plt.xticks(component_ids)
plt.legend(loc="best")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build random forest classifier
# Training sets to compare, keyed by the sampling strategy's display name;
# each value is a (features, labels) pair.
methods_data = {
    "Original": (X_train, y_train),
    "Upsampled": (X_train_u, y_train_u),
    "Downsampled": (X_train_d, y_train_d),
}
for method in methods_data.keys(): | |
pip_rf = make_pipeline(StandardScaler(), | |
RandomForestClassifier(n_estimators=500, | |
class_weight="balanced", | |
random_state=123)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build Gradient Boosting classifier
# The loss is left at its default: the explicit loss="deviance" spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3 (renamed "log_loss"),
# while the default selects that same loss on both old and new releases.
pip_gb = make_pipeline(StandardScaler(),
                       GradientBoostingClassifier(random_state=123))

# Grid-search space; keys use the "<pipeline step name>__<parameter>" form.
hyperparam_grid = {"gradientboostingclassifier__max_features": ["log2", 0.5],
                   "gradientboostingclassifier__n_estimators": [100, 300, 500],
                   "gradientboostingclassifier__learning_rate": [0.001, 0.01, 0.1],
                   "gradientboostingclassifier__max_depth": [1, 2, 3]}
gs_gb = GridSearchCV(pip_gb, | |
param_grid=hyperparam_grid, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build KNN classifier: standardize the features, then classify by
# nearest neighbours.
pip_knn = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighbourhood sizes: k = 1 .. 19
hyperparam_range = range(1, 20)

# Search space; keys use the "<pipeline step name>__<parameter>" form.
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": hyperparam_range,
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# Exhaustive search scored by F1 with 10-fold cross-validation, using all
# available cores.
gs_knn = GridSearchCV(
    pip_knn,
    param_grid=knn_param_grid,
    scoring="f1",
    cv=10,
    n_jobs=-1,
)
gs_knn.fit(X_train, y_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build logistic model classifier
# The grid below searches both L1 and L2 penalties. scikit-learn's default
# solver ("lbfgs" since 0.22) does not support L1, so every "l1" candidate
# would fail to fit; "liblinear" supports both penalties.
pip_logmod = make_pipeline(StandardScaler(),
                           LogisticRegression(class_weight="balanced",
                                              solver="liblinear"))

# Inverse regularization strengths C: 0.5, 1.0, ..., 20.0
hyperparam_range = np.arange(0.5, 20.1, 0.5)

# Grid-search space; keys use the "<pipeline step name>__<parameter>" form.
hyperparam_grid = {"logisticregression__penalty": ["l1", "l2"],
                   "logisticregression__C": hyperparam_range,
                   "logisticregression__fit_intercept": [True, False]
                   }
gs_logmodel = GridSearchCV(pip_logmod, | |
hyperparam_grid, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build SVM classifier
# Degree-5 polynomial-kernel SVC; probability=True asks scikit-learn to
# also fit class-probability estimates.
poly_svc = SVC(
    kernel="poly",
    degree=5,
    coef0=10,
    gamma=0.1,
    C=0.01,
    probability=True,
)

# Standardize the features before the SVM, then fit on the training split.
clf_svc = make_pipeline(StandardScaler(), poly_svc)
clf_svc.fit(X_train, y_train)
svc_cv_scores = cross_val_score(clf_svc, |
OlderNewer