# Imports used throughout (df is assumed to hold the credit card fraud data
# with a binary 'Class' target)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# DummyClassifier to predict only target 0
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
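# Sanity-check the baseline (a minimal sketch): most_frequent never predicts
# fraud, so accuracy equals the majority-class share while recall is zero.
accuracy_score(y_test, dummy_pred)
recall_score(y_test, dummy_pred)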
# Modeling the data as is
# Train model
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Predict on test set
lr_pred = lr.predict(X_test)
# Checking accuracy
accuracy_score(y_test, lr_pred)   # 0.9992
# f1 score
f1_score(y_test, lr_pred)         # 0.7522
# recall score
recall_score(y_test, lr_pred)     # 0.6439
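# For a fuller error breakdown (sketch), the confusion matrix separates
# missed frauds (false negatives) from false alarms (false positives).
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lr_pred)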
from sklearn.ensemble import RandomForestClassifier
# train model
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
# predict on test set
rfc_pred = rfc.predict(X_test)
accuracy_score(y_test, rfc_pred)  # 0.9995
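# Accuracy is again near-perfect; recall and f1 on the same test set give
# a fairer comparison with logistic regression (sketch).
f1_score(y_test, rfc_pred)
recall_score(y_test, rfc_pred)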
from sklearn.utils import resample
# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_fraud = X[X.Class == 0]
fraud = X[X.Class == 1]

# upsample minority
fraud_upsampled = resample(fraud,
                           replace=True,              # sample with replacement
                           n_samples=len(not_fraud),  # match majority n
                           random_state=27)           # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# trying logistic regression again with the balanced dataset
y_train = upsampled.Class
X_train = upsampled.drop('Class', axis=1)
upsampled_lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
upsampled_pred = upsampled_lr.predict(X_test)
# Checking accuracy
accuracy_score(y_test, upsampled_pred)
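# Recall is the metric that matters most for fraud detection; f1 balances
# it against precision (sketch).
f1_score(y_test, upsampled_pred)
recall_score(y_test, upsampled_pred)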
# still using our separated classes fraud and not_fraud from above
# downsample majority
not_fraud_downsampled = resample(not_fraud,
                                 replace=False,          # sample without replacement
                                 n_samples=len(fraud),   # match minority n
                                 random_state=27)        # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])
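# Verify the new class balance (sketch): both classes should now have
# len(fraud) rows.
downsampled.Class.value_counts()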
# trying logistic regression again with the undersampled dataset
y_train = downsampled.Class
X_train = downsampled.drop('Class', axis=1)
undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)
undersampled_pred = undersampled.predict(X_test)
# Checking accuracy
accuracy_score(y_test, undersampled_pred)
from imblearn.over_sampling import SMOTE
# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
sm = SMOTE(random_state=27, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)
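# Fitting on the SMOTE-resampled training data follows the same pattern as
# above (sketch; smote_lr and smote_pred are illustrative names).
smote_lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
smote_pred = smote_lr.predict(X_test)
# Score on the untouched test set
accuracy_score(y_test, smote_pred)
recall_score(y_test, smote_pred)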