Created
February 26, 2013 09:53
-
-
Save mikaelbr/5037374 to your computer and use it in GitHub Desktop.
A rudimentary experiment file for grid searching on multinomial naive Bayes classification.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
import numpy as np | |
import logging | |
from os import path | |
from sklearn import decomposition, linear_model | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.svm import LinearSVC | |
from sklearn.datasets import load_files | |
from sklearn.cross_validation import train_test_split, StratifiedKFold | |
from sklearn.grid_search import GridSearchCV | |
from sklearn import metrics | |
import filters as f | |
import preprocess as p | |
# Configure the root logger once, at import time: DEBUG level and above,
# each record prefixed with its timestamp.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
def add_filters(text):
    """Apply the currently enabled text filters to ``text``, in a fixed order.

    The value is threaded through ``f.no_url``, ``f.no_usernames``,
    ``f.no_hash``, ``f.reduce_letter_duplicates`` and finally
    ``p.negation_attachment``; every step receives the previous step's
    result.  The commented-out calls are alternative steps that are
    switched off for this experiment and are kept at the position where
    they would run.

    NOTE(review): what each filter does exactly is defined in the
    project-local ``filters`` and ``preprocess`` modules, which are not
    visible from this file -- confirm there before relying on the names.

    :param text: the document to clean.
    :returns: whatever the last enabled step returns.
    """
    text = f.no_url(text)
    # text = f.username_placeholder(text)
    text = f.no_usernames(text)
    # text = f.no_emoticons(text)
    text = f.no_hash(text)
    # text = f.no_rt_tag(text)
    text = f.reduce_letter_duplicates(text)
    # text = p.remove_stopwords(text, ['not'])
    text = p.negation_attachment(text)
    return text
# ---------------------------------------------------------------------------
# Experiment driver.
#
# Usage: <script> [train_tsv] [test_tsv]
#
# 1. Load two tab-separated files (column 4 is used as the document text and
#    column 3 as the class label).
# 2. Split the training file 50/50 into a fitting part and a held-out part.
# 3. Grid-search a CountVectorizer -> TfidfTransformer -> MultinomialNB
#    pipeline with stratified 5-fold cross-validation.
# 4. Report the best parameters, the confusion matrix on the held-out part
#    and the accuracy on the separate test file.
#
# NOTE(review): the scikit-learn names used here (``charset_error``,
# ``StratifiedKFold(y, n_folds=...)``, the ``cross_validation`` and
# ``grid_search`` modules) belong to an old scikit-learn API generation;
# they are kept as-is to match the imports at the top of this file.
# ---------------------------------------------------------------------------
DEFAULT_TRAIN_SET = 'data/train/output_tweets.tsv'
DEFAULT_TEST_SET = 'data/test/test_output_tweets.tsv'

# A missing *or empty* command-line argument falls back to the default path.
train_set_filename = (
    (sys.argv[1] if len(sys.argv) > 1 else '') or DEFAULT_TRAIN_SET)
test_set_filename = (
    (sys.argv[2] if len(sys.argv) > 2 else '') or DEFAULT_TEST_SET)

# Fail early, and say which file is missing.  IOError is a subclass of
# Exception, so anything that caught the previous bare Exception still works.
for required_file in (train_set_filename, test_set_filename):
    if not path.exists(required_file):
        raise IOError("File not found: %s" % required_file)

# comments=None keeps '#' characters inside the data from being treated as
# the start of a comment; dtype='S' loads every field as a byte string.
my_data = np.loadtxt(
    train_set_filename, delimiter='\t', dtype='S', comments=None)
my_test_data = np.loadtxt(
    test_set_filename, delimiter='\t', dtype='S', comments=None)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    my_data[:, 4], my_data[:, 3], test_size=0.5, random_state=0)

# vect = CountVectorizer(charset_error='ignore')
pipeline = Pipeline([
    ('vect', CountVectorizer(charset_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Keys follow the "<pipeline step>__<parameter>" convention.
param_grid = {
    'vect__ngram_range': ((1, 1), (2, 2), (3, 3)),
    'vect__stop_words': ('english', None),
    # The original entry here was an unparseable placeholder
    # ("(func1, func2, func3 ....)" with no colon).  Candidates are now the
    # vectorizer's built-in preprocessing (None) and this file's
    # add_filters.  Further callables taking and returning a string can be
    # appended to the tuple.
    'vect__preprocessor': (None, add_filters),
    'tfidf__use_idf': (True, False),
    'clf__alpha': tuple(np.arange(0.1, 1.0, 0.1)),
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=StratifiedKFold(y_train, n_folds=5),
    refit=True,
    n_jobs=-1
)

print("1. Training ")
grid.fit(docs_train, y_train)
print("2. Done training")

print(grid.best_params_)

y_predicted = grid.best_estimator_.predict(docs_test)

# print metrics.classification_report(y_test, y_predicted, target_names=['"positive"', '"negative"', '"neutral"', '"objective"'])

# Plot the confusion matrix
print(metrics.confusion_matrix(y_test, y_predicted))

print("Best score: %s" % grid.best_score_)

# # Predict the result on some short new sentences:
predicted = grid.best_estimator_.predict(my_test_data[:, 4])

# print predicted[:4]
# print my_test_data[:,3]

# Accuracy on the separate test file, computed by hand and then again via
# the fitted grid's own score().
print("Calculated acc: %s" % np.mean(predicted == my_test_data[:, 3]))
print("Accuracy Score: %.2f" % grid.score(my_test_data[:, 4], my_test_data[:, 3]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment