@egemenzeytinci
Created December 26, 2019 21:54
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd


def compare():
    # df_le / selects_le (label-encoded features) and df_hot / selects_hot
    # (dummy-variable features) are assumed to be defined earlier
    for is_le in [True, False]:
        method = 'label encoder'

        if is_le:
            selected = df_le[selects_le + ['is_canceled']]
        else:
            selected = df_hot[selects_hot + ['is_canceled']]
            method = 'dummy variables'

        # separate majority and minority classes
        major = selected[selected['is_canceled'] == 0]
        minor = selected[selected['is_canceled'] == 1]

        # downsample majority class to the size of the minority class
        downsampled = resample(major, replace=False, n_samples=len(minor), random_state=123)

        # combine minority class with downsampled majority class
        df_new = pd.concat([downsampled, minor])

        X = df_new.drop('is_canceled', axis=1)
        y = df_new['is_canceled']

        # hold out 20% of the balanced data for evaluation
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

        # fit a logistic regression and score it on the test split
        log = LogisticRegression().fit(X_train, y_train)
        y_pred = log.predict(X_test)

        print(f'Accuracy for {method}: {accuracy_score(y_test, y_pred)}')
        print(f'Classification report for {method}:\n{classification_report(y_test, y_pred)}')
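The helper above expects two preprocessed dataframes that the gist does not show: df_le with label-encoded categoricals (feature list selects_le) and df_hot with dummy variables (feature list selects_hot). The sketch below is only one way those inputs could be prepared, not the author's code; the source file name hotel_bookings.csv, the use of LabelEncoder and pd.get_dummies, and the feature lists are assumptions.

from sklearn.preprocessing import LabelEncoder
import pandas as pd

# hypothetical source file for the hotel bookings data
df = pd.read_csv('hotel_bookings.csv')
categoricals = df.select_dtypes(include='object').columns

# label-encoded copy: each categorical column mapped to integer codes
df_le = df.copy()
for col in categoricals:
    df_le[col] = LabelEncoder().fit_transform(df_le[col].astype(str))

# dummy-variable copy: categoricals expanded into one-hot indicator columns
df_hot = pd.get_dummies(df, columns=list(categoricals))

# hypothetical feature selections; the original likely used a narrower subset
selects_le = [c for c in df_le.columns if c != 'is_canceled']
selects_hot = [c for c in df_hot.columns if c != 'is_canceled']

# run the comparison of the two encodings
compare()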