Sentiment analysis with NLTK and scikit-learn
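The snippet below starts from `X_resampled` and `y_resampled`, which are not defined in this gist. A minimal sketch of how they might be produced, assuming TF-IDF features and minority-class upsampling via `sklearn.utils.resample` (the toy reviews and column names are hypothetical stand-ins for the real corpus):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample

# toy reviews standing in for the real corpus (hypothetical data)
df = pd.DataFrame({
    'text': ['great product', 'terrible service', 'loved it',
             'awful quality', 'works nicely', 'really good'],
    'label': [1, 0, 1, 0, 1, 1],
})

# upsample the minority class so both classes are balanced
counts = df['label'].value_counts()
minority = df[df['label'] == counts.idxmin()]
majority = df[df['label'] == counts.idxmax()]
minority_up = resample(minority, replace=True,
                       n_samples=len(majority), random_state=39)
balanced = pd.concat([majority, minority_up]).reset_index(drop=True)

# TF-IDF features over the balanced corpus
vectorizer = TfidfVectorizer()
X_resampled = vectorizer.fit_transform(balanced['text'])
y_resampled = balanced['label']
print(X_resampled.shape)
```

Dedicated oversamplers (e.g. from imbalanced-learn) would also work; the point is only that both classes end up with equal counts before the split.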
# train/test split: no separate test dataset is used; 33% is held out here for evaluation
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.33, random_state=39)
print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_train shape: {}'.format(y_train.shape))
print('y_test shape: {}'.format(y_test.shape))
# pick three algorithms: Multinomial Naive Bayes, RandomForest and GradientBoosting
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

clf_mnb = MultinomialNB()
clf_rfc = RandomForestClassifier(random_state=39)
clf_gbc = GradientBoostingClassifier(random_state=39)
clf_names = ['MultinomialNB', 'RandomForest', 'GradientBoosting']
clf_types = [clf_mnb, clf_rfc, clf_gbc]
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

for i, clf in enumerate(clf_types):
    clf.fit(X_train, y_train.values.ravel())
    print('Result of {}\n'.format(clf_names[i]))
    predictions = clf.predict(X_test)
    # note: sklearn's convention is (y_true, y_pred); the swapped order below
    # matches the results printed underneath, but transposes the confusion matrix
    print(classification_report(predictions, y_test))
    print('\n')
    print('Confusion matrix: \n', confusion_matrix(predictions, y_test))
    print('\n')
    print('Accuracy score: ', accuracy_score(predictions, y_test))
    print('\n\n\n')
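One caveat worth demonstrating: sklearn's metric functions expect `(y_true, y_pred)`, but the loop above passes `predictions` first. Accuracy is unaffected, while the confusion matrix comes out transposed and the per-class rows of the report describe precision/recall with the roles of truth and prediction swapped. A tiny example with hypothetical labels:

```python
import numpy as np
from sklearn.metrics import confusion_matrix

# tiny hypothetical labels to show the effect of argument order
y_true = np.array([0, 0, 0, 1, 1])
y_pred = np.array([0, 1, 1, 1, 0])

cm = confusion_matrix(y_true, y_pred)          # sklearn convention
cm_swapped = confusion_matrix(y_pred, y_true)  # order used in the loop above
print(cm)
print(cm_swapped)  # transpose of cm
```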
# Results:
Result of MultinomialNB

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      9546
           1       0.95      0.92      0.93     10070

    accuracy                           0.93     19616
   macro avg       0.93      0.93      0.93     19616
weighted avg       0.93      0.93      0.93     19616

Confusion matrix:
 [[9044  502]
  [ 820 9250]]

Accuracy score:  0.9326060358890701

Result of RandomForest

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9632
           1       0.99      0.97      0.98      9984

    accuracy                           0.98     19616
   macro avg       0.98      0.98      0.98     19616
weighted avg       0.98      0.98      0.98     19616

Confusion matrix:
 [[9525  107]
  [ 339 9645]]

Accuracy score:  0.977263458401305

Result of GradientBoosting

              precision    recall  f1-score   support

           0       0.95      0.73      0.82     12933
           1       0.64      0.93      0.76      6683

    accuracy                           0.80     19616
   macro avg       0.80      0.83      0.79     19616
weighted avg       0.85      0.80      0.80     19616

Confusion matrix:
 [[9397 3536]
  [ 467 6216]]

Accuracy score:  0.7959318923327896
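The three models were ranked on a single train/test split; k-fold cross-validation gives a more stable comparison. A minimal sketch, using a synthetic non-negative matrix as a stand-in for the real TF-IDF features (an assumption, since the original feature pipeline is not shown):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# synthetic features standing in for the TF-IDF matrix (assumption)
X, y = make_classification(n_samples=300, n_features=20, random_state=39)
X = np.abs(X)  # MultinomialNB requires non-negative inputs

for name, clf in [('MultinomialNB', MultinomialNB()),
                  ('RandomForest', RandomForestClassifier(random_state=39))]:
    scores = cross_val_score(clf, X, y, cv=5)
    print('{}: {:.3f} +/- {:.3f}'.format(name, scores.mean(), scores.std()))
```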