Skip to content

Instantly share code, notes, and snippets.

@mikaelbr
Created February 26, 2013 09:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikaelbr/5037374 to your computer and use it in GitHub Desktop.
Save mikaelbr/5037374 to your computer and use it in GitHub Desktop.
A rudimentary experiment file for grid searching on multinomial naive Bayes classification.
#!/usr/bin/python
import sys
import numpy as np
import logging
from os import path
from sklearn import decomposition, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
import filters as f
import preprocess as p
# Configure the root logger once at start-up: DEBUG threshold, and every
# record prefixed with its timestamp.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
)
def add_filters(text):
    """Apply the enabled text filters to ``text`` and return the result.

    The filters run in a fixed order, each one receiving the output of the
    previous one: ``f.no_url``, ``f.no_usernames``, ``f.no_hash``,
    ``f.reduce_letter_duplicates`` and finally ``p.negation_attachment``.
    """
    # Only the steps enabled for this experiment are listed. The gist also
    # mentioned username placeholders, emoticon removal, RT-tag removal and
    # stop-word removal, all of which were switched off.
    enabled_steps = (
        f.no_url,
        f.no_usernames,
        f.no_hash,
        f.reduce_letter_duplicates,
        p.negation_attachment,
    )
    for step in enabled_steps:
        text = step(text)
    return text
# Dataset locations: the first and second command-line arguments, falling back
# to the default paths when an argument is missing or empty.
train_set_filename = 'data/train/output_tweets.tsv'
test_set_filename = 'data/test/test_output_tweets.tsv'
if len(sys.argv) > 1 and sys.argv[1]:
    train_set_filename = sys.argv[1]
if len(sys.argv) > 2 and sys.argv[2]:
    test_set_filename = sys.argv[2]

# Fail early, and say which file is missing, before any parsing starts.
# IOError is a subclass of Exception, so existing handlers keep working.
for dataset_path in (train_set_filename, test_set_filename):
    if not path.exists(dataset_path):
        raise IOError("File not found: %s" % dataset_path)

# Both files are tab-separated and every field is read as a byte string.
# comments=None stops '#' from being treated as a comment marker, so rows
# containing hashtags are not truncated.
my_data = np.loadtxt(train_set_filename, delimiter='\t', dtype='S', comments=None)
my_test_data = np.loadtxt(test_set_filename, delimiter='\t', dtype='S', comments=None)
# Hold out half of the training file for evaluating the tuned model.
# Column 4 supplies the documents and column 3 the labels; the fixed
# random_state keeps the split repeatable between runs.
documents = my_data[:, 4]
labels = my_data[:, 3]
docs_train, docs_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.5,
    random_state=0,
)
# Classification pipeline: token counts -> TF-IDF weighting -> multinomial
# naive Bayes. Undecodable bytes in the input are ignored by the vectorizer.
pipeline = Pipeline([
    ('vect', CountVectorizer(charset_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Exhaustive search over the candidates below. Keys follow the
# "<pipeline step>__<parameter>" convention.
grid = GridSearchCV(
    pipeline,
    {
        # Unigrams only, bigrams only, or trigrams only.
        'vect__ngram_range': ((1, 1), (2, 2), (3, 3)),
        'vect__stop_words': ('english', None),
        # Compare the vectorizer's built-in preprocessing (None) with the
        # add_filters chain defined in this file. The original line was a
        # placeholder without a colon and with undefined names; add further
        # callables here to try other preprocessing variants.
        # NOTE(review): a custom preprocessor replaces the vectorizer's own
        # preprocessing step (e.g. lowercasing) -- confirm this is intended.
        'vect__preprocessor': (None, add_filters),
        'tfidf__use_idf': (True, False),
        # Smoothing values 0.1, 0.2, ..., 0.9.
        'clf__alpha': tuple(np.arange(0.1, 1.0, 0.1)),
    },
    # Five stratified folds built from the training labels.
    cv=StratifiedKFold(y_train, n_folds=5),
    # Refit the best configuration so that best_estimator_ is available.
    refit=True,
    # Run the search on all available cores.
    n_jobs=-1
)
# Run the grid search on the training half and show the winning parameters.
print("1. Training ")
grid.fit(docs_train, y_train)
print("2. Done training")
print(grid.best_params_)

# Evaluate the refitted best model on the held-out half of the training file.
# (A per-class report could be produced with metrics.classification_report.)
best_model = grid.best_estimator_
y_predicted = best_model.predict(docs_test)

# Print the confusion matrix for the held-out half, then the best
# cross-validation score found by the search.
print(metrics.confusion_matrix(y_test, y_predicted))
print("Best score: %s" % grid.best_score_)

# Evaluate on the separate test file: column 4 holds the documents and
# column 3 the expected labels.
test_docs = my_test_data[:, 4]
test_labels = my_test_data[:, 3]
predicted = best_model.predict(test_docs)

# Accuracy computed by hand, followed by the score reported by the search.
print("Calculated acc: %s" % np.mean(predicted == test_labels))
print("Accuracy Score: %.2f" % grid.score(test_docs, test_labels))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment