Skip to content

Instantly share code, notes, and snippets.

@alep
Created September 8, 2014 15:37
Show Gist options
  • Save alep/49594a86255afaad9866 to your computer and use it in GitHub Desktop.
Save alep/49594a86255afaad9866 to your computer and use it in GitHub Desktop.
# coding: utf-8
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn import metrics
# Train and evaluate two URL classifiers (Bernoulli naive Bayes and logistic
# regression) on character 4-gram counts, printing a classification report
# for each one.

# Load the tab-separated dataset: an 8-byte integer `category` column and an
# object-typed `url` column. The context manager guarantees the file handle
# is closed once the array has been built.
with open("data.tsv") as datafile:
    data = sp.genfromtxt(datafile, delimiter="\t", dtype='<i8,object', names='category,url')

# Bag of overlapping character 4-grams taken from each URL string.
vec = CountVectorizer(analyzer='char', ngram_range=(4, 4))

# A single stratified shuffle split keyed on the category labels, with 10% of
# the rows used for training (the test size is left at the library default).
spliter = cross_validation.StratifiedShuffleSplit(data['category'], n_iter=1, train_size=0.1)
train_index, test_index = next(iter(spliter))

# `instances` and `labels` are kept as aliases of the training arrays so any
# code relying on those module-level names keeps working.
X_train = instances = data['url'][train_index]
y_train = labels = data['category'][train_index]
X_test = data['url'][test_index]
y_test = data['category'][test_index]

# Single-argument print(...) produces the same output on Python 2 and 3.
print(len(X_train))
print(len(X_test))

# Fit the vocabulary on the training URLs only, then reuse it for the test
# URLs so both matrices share the same feature columns.
X_train_vectorized = vec.fit_transform(X_train, y_train)
X_test_vectorized = vec.transform(X_test)

# --- Bernoulli naive Bayes ---
nb_clf = BernoulliNB()
nb_clf.fit(X_train_vectorized, y_train)
# Fixed: the original called the undefined name `clf` and scored against the
# undefined name `y_text`.
predicted_nb = nb_clf.predict(X_test_vectorized)
print(metrics.classification_report(y_test, predicted_nb))

# --- Logistic regression ---
# This takes some time...
lr_clf = LogisticRegression()
lr_clf.fit(X_train_vectorized, y_train)
# Fixed: the original predicted on the undefined name `X_test_counts`.
predicted_lr = lr_clf.predict(X_test_vectorized)
print(metrics.classification_report(y_test, predicted_lr))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment