Skip to content

Instantly share code, notes, and snippets.

@yuriybash
Created February 1, 2019 00:59
Show Gist options
  • Save yuriybash/c4d5542ae9e283e183ac015961ab9936 to your computer and use it in GitHub Desktop.
Save yuriybash/c4d5542ae9e283e183ac015961ab9936 to your computer and use it in GitHub Desktop.
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline
# other
from datetime import datetime
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from scipy import sparse
def prep_data(in_file):
    """Load a labeled CSV and turn it into a dense TF-IDF feature matrix.

    The CSV at ``in_file`` must provide ``title``, ``url`` and ``noneng``
    columns. Titles and URLs are vectorized separately, each with its own
    ``TfidfVectorizer`` capped at 500 features, and the two dense matrices
    are joined column-wise (title features first, then URL features).

    Both vectorizers are created and fit inside this call, on the rows of
    this file only.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the dense feature array with one row
        per CSV row, ``Y`` holds the values of the ``noneng`` column.
    """
    frame = pd.read_csv(in_file)

    feature_blocks = []
    for column in ("title", "url"):
        # A fresh vectorizer per text column; fit and transform in one pass.
        vectorizer = TfidfVectorizer(max_features=500)
        feature_blocks.append(vectorizer.fit_transform(frame[column]).toarray())

    X = np.hstack(feature_blocks)
    Y = frame["noneng"].values
    return X, Y
def display_scores(scores):
    """Print a score array followed by its mean and standard deviation.

    ``scores`` must expose ``mean()`` and ``std()`` methods (for example the
    NumPy array returned by ``cross_val_score``). Nothing is returned.
    """
    print("Scores: ", scores)
    # Each statistic is computed right before it is printed, in this order.
    for label, method_name in (("Mean: ", "mean"), ("STD Dev: ", "std")):
        print(label, getattr(scores, method_name)())
# Build one feature matrix / label vector pair per input CSV: the hand-labeled
# data and (judging by the file name) a mix of hand- and auto-labeled data.
# NOTE(review): prep_data creates and fits new vectorizers on every call, so
# the columns of X_hand and X_autolabeled presumably correspond to different
# vocabularies -- confirm before feeding one matrix to a model trained on the other.
X_hand, Y_hand = prep_data('/Users/yuriy/learning/machine_learning/non_hacker_news_ml/data/data.csv')
X_autolabeled, Y_autolabeled = prep_data('/Users/yuriy/learning/machine_learning/non_hacker_news_ml/data/labeled_autolabeled_mix.csv')
#
# Disabled experiment: oversample both data sets with SMOTE, then compare
# 10-fold cross-validated precision of MultinomialNB(alpha=0.20) on each.
# X_hand_adjusted, Y_hand_adjusted = SMOTE().fit_resample(X_hand, Y_hand)
# X_autolabeled_adjusted, Y_autolabeled_adjusted = SMOTE().fit_resample(X_autolabeled, Y_autolabeled)
#<------INCLUDES_SYNTHETIC-------->
# hand_mnb_scores = cross_val_score(MultinomialNB(alpha=0.20), X_hand_adjusted, Y_hand_adjusted, scoring='precision', cv=10)
# autolabeled_mb_scores = cross_val_score(MultinomialNB(alpha=0.20), X_autolabeled_adjusted, Y_autolabeled_adjusted, scoring='precision', cv=10)
#
#
# print "hand: "
# display_scores(hand_mnb_scores) # ('Mean: ', 0.8533958725633479)
#
# print "autolabeled: "
# display_scores(autolabeled_mb_scores) # ('Mean: ', 0.943126558681957)
# Fit one MultinomialNB (alpha=20.0) per data set on the full, un-resampled data.
hand_clf = MultinomialNB(alpha=20.0)
hand_clf.fit(X_hand, Y_hand)
auto_clf = MultinomialNB(alpha=20.0)
auto_clf.fit(X_autolabeled, Y_autolabeled)
# Disabled experiment: 10-fold cross-validated accuracy of both classifiers,
# each scored on the hand-labeled data.
# hand_mnb_scores = cross_val_score(hand_clf, X_hand, Y_hand, scoring='accuracy', cv=10)
# autolabeled_mb_scores = cross_val_score(auto_clf, X_hand, Y_hand, scoring='accuracy', cv=10)
#
#
# print "hand: "
# display_scores(hand_mnb_scores) #
#
# print "autolabeled: "
# display_scores(autolabeled_mb_scores) #
#
# Out-of-fold predictions (10 folds) on the hand-labeled data for both classifiers.
hand_mnb_predict = cross_val_predict(hand_clf, X_hand, Y_hand, cv=10)
autolabeled_mb_predict = cross_val_predict(auto_clf, X_hand, Y_hand, cv=10)
# ^ Both results hold exactly the same values, even though hand_clf and auto_clf
# were trained on different data. Per the scikit-learn documentation,
# cross_val_predict fits a clone of the estimator on each training fold, so the
# earlier .fit() calls are ignored and both calls end up training
# MultinomialNB(alpha=20.0) on the same X_hand / Y_hand folds.
# And yet calling predict() directly on the two fitted models gives different results:
# ipdb> hand_clf.predict(X_hand).sum()
# 211
# ipdb> auto_clf.predict(X_hand).sum()
# 2
# ipdb>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment