Created
February 1, 2019 00:59
-
-
Save yuriybash/c4d5542ae9e283e183ac015961ab9936 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data analysis and wrangling | |
import pandas as pd | |
import numpy as np | |
import random as rnd | |
# visualization | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
# %matplotlib inline | |
# other | |
from datetime import datetime | |
# machine learning | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC, LinearSVC | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve | |
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.linear_model import Perceptron | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.utils import shuffle | |
from imblearn.over_sampling import SMOTE | |
from scipy import sparse | |
def _tfidf_features(texts, vectorizer=None, max_features=500):
    """Return a dense TF-IDF matrix for *texts*.

    When *vectorizer* is None a new ``TfidfVectorizer`` limited to
    *max_features* terms is fitted on *texts* and applied in a single
    ``fit_transform`` pass.  Otherwise *vectorizer* is assumed to be fitted
    already and is only used to ``transform`` the texts, so the resulting
    columns line up with whatever data it was fitted on.
    """
    if vectorizer is None:
        fresh = TfidfVectorizer(max_features=max_features)
        return fresh.fit_transform(texts).toarray()
    return vectorizer.transform(texts).toarray()


def prep_data(in_file, title_vectorizer=None, url_vectorizer=None,
              max_features=500):
    """Load a labeled CSV and build a TF-IDF feature matrix plus labels.

    The CSV must contain the text columns ``title`` and ``url`` and the
    label column ``noneng``.

    By default every call fits its own vectorizers on the file it reads, so
    the feature columns produced for two different files are NOT guaranteed
    to describe the same tokens.  To score a model trained on one file
    against another file, fit the vectorizers once on the training data and
    pass them in here so both matrices share one feature space.

    Args:
        in_file: Anything ``pandas.read_csv`` accepts (path or buffer).
        title_vectorizer: Optional already-fitted vectorizer for ``title``.
            If None, a new ``TfidfVectorizer`` is fitted on this file.
        url_vectorizer: Optional already-fitted vectorizer for ``url``.
            If None, a new ``TfidfVectorizer`` is fitted on this file.
        max_features: Vocabulary cap for vectorizers created by this call
            (ignored for vectorizers that are passed in).

    Returns:
        A tuple ``(X, Y)``: ``X`` is a dense 2-D array holding the title
        features followed by the URL features, and ``Y`` holds the values
        of the ``noneng`` column.
    """
    data_df = pd.read_csv(in_file)
    X_title = _tfidf_features(data_df['title'], title_vectorizer, max_features)
    X_url = _tfidf_features(data_df['url'], url_vectorizer, max_features)
    # Title columns first, URL columns second -- same layout as before.
    X = np.concatenate([X_title, X_url], axis=1)
    Y = data_df['noneng'].values
    return X, Y
def display_scores(scores):
    """Print a score array followed by its mean and standard deviation.

    *scores* must provide ``mean()`` and ``std()`` methods (for example the
    array returned by ``cross_val_score``).  Nothing is returned.
    """
    print("Scores: ", scores)
    average = scores.mean()
    print("Mean: ", average)
    spread = scores.std()
    print("STD Dev: ", spread)
# ---------------------------------------------------------------------------
# Load the hand-labeled set and the hand/auto-labeled mix.
# NOTE(review): each prep_data() call fits its own TF-IDF vectorizers, so the
# columns of X_hand and X_autolabeled may not refer to the same tokens.
# ---------------------------------------------------------------------------
X_hand, Y_hand = prep_data(
    '/Users/yuriy/learning/machine_learning/non_hacker_news_ml/data/data.csv'
)
X_autolabeled, Y_autolabeled = prep_data(
    '/Users/yuriy/learning/machine_learning/non_hacker_news_ml/data/labeled_autolabeled_mix.csv'
)

# Disabled experiment 1 -- oversample with SMOTE, then 10-fold precision.
# <------INCLUDES_SYNTHETIC-------->
#   X_hand_adjusted, Y_hand_adjusted = SMOTE().fit_resample(X_hand, Y_hand)
#   X_autolabeled_adjusted, Y_autolabeled_adjusted = SMOTE().fit_resample(X_autolabeled, Y_autolabeled)
#   hand_mnb_scores = cross_val_score(MultinomialNB(alpha=0.20), X_hand_adjusted, Y_hand_adjusted, scoring='precision', cv=10)
#   autolabeled_mb_scores = cross_val_score(MultinomialNB(alpha=0.20), X_autolabeled_adjusted, Y_autolabeled_adjusted, scoring='precision', cv=10)
#   display_scores(hand_mnb_scores)        # hand:        ('Mean: ', 0.8533958725633479)
#   display_scores(autolabeled_mb_scores)  # autolabeled: ('Mean: ', 0.943126558681957)

# One heavily smoothed Naive Bayes model per dataset; fit() returns the
# estimator itself, so construction and training are chained.
hand_clf = MultinomialNB(alpha=20.0).fit(X_hand, Y_hand)
auto_clf = MultinomialNB(alpha=20.0).fit(X_autolabeled, Y_autolabeled)

# Disabled experiment 2 -- 10-fold accuracy of both models on the hand set.
#   hand_mnb_scores = cross_val_score(hand_clf, X_hand, Y_hand, scoring='accuracy', cv=10)
#   autolabeled_mb_scores = cross_val_score(auto_clf, X_hand, Y_hand, scoring='accuracy', cv=10)
#   display_scores(hand_mnb_scores)        # hand
#   display_scores(autolabeled_mb_scores)  # autolabeled

# 10-fold out-of-fold predictions on the hand-labeled data, once per model.
hand_mnb_predict = cross_val_predict(hand_clf, X_hand, Y_hand, cv=10)
autolabeled_mb_predict = cross_val_predict(auto_clf, X_hand, Y_hand, cv=10)
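# ^ Both calls return exactly the same values even though hand_clf and
# auto_clf were trained on different data: cross_val_predict clones the
# estimator and refits the clone on the folds of the data it is given
# (X_hand, Y_hand), so whatever the model learned earlier is ignored.
# And yet the two fitted models clearly differ when used directly:
# ipdb> hand_clf.predict(X_hand).sum()
# 211
# ipdb> auto_clf.predict(X_hand).sum()
# 2
# ipdb>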
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment