# Imports used throughout (df is assumed to hold the credit card fraud data
# with a binary 'Class' target)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# DummyClassifier to predict only target 0
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
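# Sanity-check the baseline (a minimal sketch): most_frequent never predicts
# fraud, so accuracy equals the majority-class share while recall is zero.
accuracy_score(y_test, dummy_pred)
recall_score(y_test, dummy_pred)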
# Modeling the data as is
# Train model
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Predict on test set
lr_pred = lr.predict(X_test)
# Checking accuracy
accuracy_score(y_test, lr_pred)   # 0.9992
# f1 score
f1_score(y_test, lr_pred)         # 0.7522
# recall score
recall_score(y_test, lr_pred)     # 0.6439
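# For a fuller error breakdown (sketch), the confusion matrix separates
# missed frauds (false negatives) from false alarms (false positives).
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr_pred)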
from sklearn.ensemble import RandomForestClassifier
# train model
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
# predict on test set
rfc_pred = rfc.predict(X_test)
accuracy_score(y_test, rfc_pred)  # 0.9995
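# Accuracy is again near-perfect; recall and f1 on the same test set give
# a fairer comparison with logistic regression (sketch).
f1_score(y_test, rfc_pred)
recall_score(y_test, rfc_pred)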
from sklearn.utils import resample
# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_fraud = X[X.Class == 0]
fraud = X[X.Class == 1]

# upsample minority
fraud_upsampled = resample(fraud,
                           replace=True,              # sample with replacement
                           n_samples=len(not_fraud),  # match majority n
                           random_state=27)           # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# trying logistic regression again with the balanced dataset
y_train = upsampled.Class
X_train = upsampled.drop('Class', axis=1)
upsampled_lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
upsampled_pred = upsampled_lr.predict(X_test)
# Checking accuracy
accuracy_score(y_test, upsampled_pred)
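# Recall is the metric that matters most for fraud detection; f1 balances
# it against precision (sketch).
f1_score(y_test, upsampled_pred)
recall_score(y_test, upsampled_pred)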
# still using our separated classes fraud and not_fraud from above
# downsample majority
not_fraud_downsampled = resample(not_fraud,
                                 replace=False,          # sample without replacement
                                 n_samples=len(fraud),   # match minority n
                                 random_state=27)        # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])
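# Verify the new class balance (sketch): both classes should now have
# len(fraud) rows.
downsampled.Class.value_counts()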
# trying logistic regression again with the undersampled dataset
y_train = downsampled.Class
X_train = downsampled.drop('Class', axis=1)
undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)
undersampled_pred = undersampled.predict(X_test)
# Checking accuracy
accuracy_score(y_test, undersampled_pred)
from imblearn.over_sampling import SMOTE
# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
sm = SMOTE(random_state=27, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)
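# Fitting on the SMOTE-resampled training data follows the same pattern as
# above (sketch; smote_lr and smote_pred are illustrative names).
smote_lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
smote_pred = smote_lr.predict(X_test)
# Score on the untouched test set
accuracy_score(y_test, smote_pred)
recall_score(y_test, smote_pred)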