Created
November 17, 2011 07:01
-
-
Save anonymous/1372557 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from time import time | |
| import numpy as np | |
| from sklearn.datasets import load_files | |
| from sklearn.feature_extraction.text import Vectorizer | |
| from sklearn.preprocessing import Normalizer | |
| from sklearn.feature_selection import SelectKBest, chi2 | |
| from sklearn.linear_model import RidgeClassifier | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.svm.sparse import LinearSVC | |
| from sklearn import metrics | |
| from sklearn.cross_validation import KFold | |
# Text-classification experiment: load a few newsgroup categories, turn the
# documents into normalized sparse features, keep the best chi2 features and
# report the average F1 / precision of a classifier over 10-fold
# cross-validation.

# Load categories
categories = ['comp.sys.mac.hardware', 'rec.sport.baseball', 'talk.politics.misc']

# Load data
# NOTE(review): the message below says "privacy policy dataset" while the path
# points at '20Newsgroup/raw' -- confirm which wording is intended.
print("Loading privacy policy dataset for categories:")
print(categories if categories else "all")
data_set = load_files('20Newsgroup/raw', categories=categories,
                      shuffle=True, random_state=42)
print('data loaded')

# Extract features
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
y = data_set.target

# Feature selection: keep the 3000 best features according to chi2.
ch2 = SelectKBest(chi2, k=3000)
X = ch2.fit_transform(X, y)

# Convert the sparse matrix to a dense array. Results recorded by the author:
#   with .toarray():   f1 0.99634, precision 0.99637
#   only X (sparse):   f1 0.99524, precision 0.99526
# All other classifiers (kNN, NB, etc.) have consistent results whether or
# not toarray() is used.
X = X.toarray()
print(type(X))
n_samples, n_features = X.shape
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % (n_samples, n_features))

# Set up 10-fold cross-validation
num_fold = 10
kf = KFold(n_samples, k=num_fold, indices=True)

# Classifier under test; the alternatives are kept for quick switching.
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=13)
clf = RidgeClassifier(tol=1e-1)
# clf = LinearSVC(loss='l2', penalty='l1', C=1000, dual=False, tol=1e-3)

# Running sums used to compute the averages over all folds
f1_all = 0.0
pre_all = 0.0

# Train and evaluate once per cross-validation fold
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Start a fresh timer for each phase: measuring against the
    # feature-extraction t0 would fold the vectorization time and all
    # previous folds into these durations.
    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start

    # metrics
    f1_score = metrics.f1_score(y_test, pred)
    pre_score = metrics.precision_score(y_test, pred)
    f1_all += f1_score
    pre_all += pre_score

    # Optional per-fold reports:
    # print data_set.target_names
    # print metrics.classification_report(y_test, pred)
    # print metrics.confusion_matrix(y_test, pred)

# Average the accumulated scores over the folds
f1_all = f1_all / num_fold
pre_all = pre_all / num_fold

print(clf)
print("average f1-score: %0.5f" % f1_all)
print("average precision: %0.5f" % pre_all)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment