Skip to content

Instantly share code, notes, and snippets.

Created November 17, 2011 07:01
Show Gist options
  • Select an option

  • Save anonymous/1372557 to your computer and use it in GitHub Desktop.

Select an option

Save anonymous/1372557 to your computer and use it in GitHub Desktop.
from time import time
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import Vectorizer
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm.sparse import LinearSVC
from sklearn import metrics
from sklearn.cross_validation import KFold
# Categories to load; a falsy value is reported as "all" in the banner below.
categories = [
    'comp.sys.mac.hardware',
    'rec.sport.baseball',
    'talk.politics.misc',
]

# Load the raw documents from disk.  The banner names the corpus that is
# actually read from '20Newsgroup/raw'.  Shuffling uses a fixed seed
# (random_state=42) so the document order is the same on every run.
print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")
data_set = load_files('20Newsgroup/raw', categories=categories,
                      shuffle=True, random_state=42)
print('data loaded')
print("")
# Turn the raw documents into a dense feature matrix X and a label vector y.
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()  # start of the feature-extraction timer reported below

# Vectorize the text (at most 10000 features), then L2-normalize each sample.
# copy=False lets the normalizer work on the matrix it is given.
vectorizer = Vectorizer(max_features=10000)
X = Normalizer(norm="l2", copy=False).transform(
    vectorizer.fit_transform(data_set.data))
y = data_set.target

# Feature selection: keep the 3000 features with the best chi-squared score.
# NOTE(review): the selector is fitted on every document and label before the
# cross-validation folds are split, so held-out documents influence which
# features are kept; the reported scores may be optimistic.
ch2 = SelectKBest(chi2, k=3000)
X = ch2.fit_transform(X, y).toarray()
# Author's recorded results for the dense vs. sparse matrix:
#   with .toarray():  f1: 0.99634, precision 0.99637
#   only X (sparse):  f1: 0.99524, precision 0.99526
# All other classifiers (kNN, NB, etc.) give consistent results whether or
# not toarray() is applied.
print(type(X))

n_samples, n_features = X.shape
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % (n_samples, n_features))
print("")
# Set up 10-fold cross-validation over all samples.
num_fold = 10
kf = KFold(n_samples, k=num_fold, indices=True)

# Classifier under evaluation; the alternatives the author tried are kept
# here so they can be swapped in.
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=13)
clf = RidgeClassifier(tol=1e-1)
# clf = LinearSVC(loss='l2', penalty='l1', C=1000, dual=False, tol=1e-3)

# Per-fold scores, averaged once the loop is done.
f1_scores = []
pre_scores = []

# One round per fold: fit on the training indices, score on the held-out ones.
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # metrics
    f1_scores.append(metrics.f1_score(y_test, pred))
    pre_scores.append(metrics.precision_score(y_test, pred))
    # print data_set.target_names
    # print metrics.classification_report(y_test, pred)
    # print metrics.confusion_matrix(y_test, pred)

# Average over the folds; float() keeps the division exact-typed on Python 2.
f1_all = sum(f1_scores) / float(num_fold)
pre_all = sum(pre_scores) / float(num_fold)

print("")
print(clf)
print("average f1-score: %0.5f" % f1_all)
print("average precision: %0.5f" % pre_all)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment