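# Imports assumed for the snippets below (the gist cells do not show them explicitly)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
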
# Load the data
df = pd.read_csv("data/HR_comma_sep.csv")
# Check the data types and whether there are missing values
print("\033[1m" + "\033[94m" + "Data types:\n" + 11 * "-")
print("\033[30m" + "{}\n".format(df.dtypes))
print("\033[1m" + "\033[94m" + "Sum of null values in each column:\n" + 35 * "-")
print("\033[30m" + "{}".format(df.isnull().sum()))
df.head()
# Rename the "sales" feature to "department"
df = df.rename(columns={"sales": "department"})
# Map salary levels to integers
salary_map = {"low": 0, "medium": 1, "high": 2}
df["salary"] = df["salary"].map(salary_map)
# Create dummy variables for department feature
df = pd.get_dummies(df, columns=["department"], drop_first=True)
# Get the number of positive and negative examples
pos = df[df["left"] == 1].shape[0]
neg = df[df["left"] == 0].shape[0]
print("Positive examples = {}".format(pos))
print("Negative examples = {}".format(neg))
print("Proportion of positive to negative examples = {:.2f}%".format((pos / neg) * 100))
sns.countplot(x="left", data=df)
plt.xticks((0, 1), ["Didn't leave", "Left"])
plt.xlabel("Left")
plt.ylabel("Count")
# Convert dataframe into numpy objects and split them into
# train and test sets: 80/20
X = df.loc[:, df.columns != "left"].values
y = df.loc[:, df.columns == "left"].values.flatten()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Upsample minority class
X_train_u, y_train_u = resample(X_train[y_train == 1],
                                y_train[y_train == 1],
                                replace=True,
                                n_samples=X_train[y_train == 0].shape[0],  # assumed: match the majority-class count
                                random_state=1)
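# Assumed continuation (not shown in this excerpt): recombine the upsampled
# minority class with the majority class, and build a downsampled variant,
# since the random-forest loop below expects X_train_u/y_train_u and
# X_train_d/y_train_d.
X_train_u = np.concatenate((X_train[y_train == 0], X_train_u))
y_train_u = np.concatenate((y_train[y_train == 0], y_train_u))

# Downsample majority class to the size of the minority class
X_train_d, y_train_d = resample(X_train[y_train == 0],
                                y_train[y_train == 0],
                                replace=False,
                                n_samples=X_train[y_train == 1].shape[0],
                                random_state=1)
X_train_d = np.concatenate((X_train[y_train == 1], X_train_d))
y_train_d = np.concatenate((y_train[y_train == 1], y_train_d))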
# Build PCA using standardized training data
pca = PCA(n_components=None, svd_solver="full")
pca.fit(StandardScaler().fit_transform(X_train))
cum_var_exp = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(12, 6))
plt.bar(range(1, 18), pca.explained_variance_ratio_, align="center",
        color="red", label="Individual explained variance")
plt.step(range(1, 18), cum_var_exp, where="mid", label="Cumulative explained variance")
plt.xticks(range(1, 18))
plt.xlabel("Principal component")
plt.ylabel("Explained variance ratio")
plt.legend(loc="best")
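# Assumed follow-up (not shown in the gist): number of components needed
# to capture 95% of the variance in the standardized training data
n_components_95 = np.argmax(cum_var_exp >= 0.95) + 1
print("Components explaining >= 95% of the variance: {}".format(n_components_95))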
# Build random forest classifier
methods_data = {"Original": (X_train, y_train),
                "Upsampled": (X_train_u, y_train_u),
                "Downsampled": (X_train_d, y_train_d)}
for method in methods_data.keys():
    pip_rf = make_pipeline(StandardScaler(),
                           RandomForestClassifier(n_estimators=500,
                                                  class_weight="balanced",
                                                  random_state=123))
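    # Assumed evaluation step (not shown in this gist excerpt): cross-validate
    # the pipeline on each resampling variant and report the mean F1-score.
    scores = cross_val_score(pip_rf,
                             methods_data[method][0],
                             methods_data[method][1],
                             scoring="f1",
                             cv=5,
                             n_jobs=-1)
    print("{} training data CV F1-score: {:.3f}".format(method, scores.mean()))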
# Build Gradient Boosting classifier
pip_gb = make_pipeline(StandardScaler(),
                       GradientBoostingClassifier(loss="deviance",  # renamed to "log_loss" in newer scikit-learn releases
                                                  random_state=123))
hyperparam_grid = {"gradientboostingclassifier__max_features": ["log2", 0.5],
                   "gradientboostingclassifier__n_estimators": [100, 300, 500],
                   "gradientboostingclassifier__learning_rate": [0.001, 0.01, 0.1],
                   "gradientboostingclassifier__max_depth": [1, 2, 3]}
gs_gb = GridSearchCV(pip_gb,
                     param_grid=hyperparam_grid,
                     # assumed completion, mirroring the KNN grid search below
                     scoring="f1",
                     cv=10,
                     n_jobs=-1)
gs_gb.fit(X_train, y_train)
# Build KNN classifier
pip_knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
hyperparam_range = range(1, 20)
gs_knn = GridSearchCV(pip_knn,
                      param_grid={"kneighborsclassifier__n_neighbors": hyperparam_range,
                                  "kneighborsclassifier__weights": ["uniform", "distance"]},
                      scoring="f1",
                      cv=10,
                      n_jobs=-1)
gs_knn.fit(X_train, y_train)
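# Assumed follow-up: inspect the best cross-validated F1-score and the chosen hyperparameters
print("Best KNN F1-score: {:.3f}".format(gs_knn.best_score_))
print("Best KNN parameters: {}".format(gs_knn.best_params_))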
# Build logistic model classifier
pip_logmod = make_pipeline(StandardScaler(),
                           LogisticRegression(class_weight="balanced",
                                              solver="liblinear"))  # liblinear supports both the l1 and l2 penalties in the grid below
hyperparam_range = np.arange(0.5, 20.1, 0.5)
hyperparam_grid = {"logisticregression__penalty": ["l1", "l2"],
                   "logisticregression__C": hyperparam_range,
                   "logisticregression__fit_intercept": [True, False]}
gs_logmodel = GridSearchCV(pip_logmod,
                           hyperparam_grid,
                           # assumed completion, mirroring the other grid searches
                           scoring="f1",
                           cv=10,
                           n_jobs=-1)
gs_logmodel.fit(X_train, y_train)
# Build SVM classifier
clf_svc = make_pipeline(StandardScaler(),
                        SVC(C=0.01,
                            gamma=0.1,
                            kernel="poly",
                            degree=5,
                            coef0=10,
                            probability=True))
clf_svc.fit(X_train, y_train)
svc_cv_scores = cross_val_score(clf_svc,
                                X_train,
                                y_train,
                                # assumed completion: 10-fold CV on the F1-score
                                scoring="f1",
                                cv=10,
                                n_jobs=-1)
print("SVM CV F1-score: {:.3f}".format(svc_cv_scores.mean()))