mikaelbr/mnb_gridsearch_test.py

## mnb_gridsearch_test.py
#!/usr/bin/python
import sys
import numpy as np
import logging

from os import path
from sklearn import decomposition, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn import metrics

import filters as f
import preprocess as p


# Configure the root logger: DEBUG threshold, each record prefixed with its
# timestamp.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)


def add_filters(text):
  """Run `text` through the active clean-up steps and return the result.

  The steps are applied in order, each one receiving the previous step's
  output: f.no_url, f.no_usernames, f.no_hash, f.reduce_letter_duplicates
  and finally p.negation_attachment.  What each step does is defined in the
  `filters` and `preprocess` modules.
  """
  # Currently disabled steps, kept here for reference:
  #   f.username_placeholder, f.no_emoticons, f.no_rt_tag,
  #   p.remove_stopwords(text, ['not'])
  active_steps = (
    f.no_url,
    f.no_usernames,
    f.no_hash,
    f.reduce_letter_duplicates,
    p.negation_attachment,
  )
  for step in active_steps:
    text = step(text)
  return text

# Resolve the two input files: argv[1] / argv[2] when given (and non-empty),
# otherwise the default paths below.
train_set_filename = (sys.argv[1] if len(sys.argv) > 1 else None) or 'data/train/output_tweets.tsv'
test_set_filename = (sys.argv[2] if len(sys.argv) > 2 else None) or 'data/test/test_output_tweets.tsv'

# Fail before loading anything, and say which path(s) could not be found.
# IOError is a subclass of Exception, so existing handlers still catch it.
missing_files = [candidate for candidate in (train_set_filename, test_set_filename)
                 if not path.exists(candidate)]
if missing_files:
  raise IOError("File not found: %s" % ', '.join(missing_files))

# Load both tab-separated files as 2-D arrays of strings.
# NOTE(review): comments=None presumably keeps '#' inside tweets from being
# treated as a comment marker -- confirm against the numpy version in use.
my_data = np.loadtxt(train_set_filename, delimiter='\t', dtype='S', comments=None)
my_test_data = np.loadtxt(test_set_filename, delimiter='\t', dtype='S', comments=None)


# Split the training file in half (fixed seed, so the split is reproducible):
# column 4 supplies the documents and column 3 the labels.
docs_train, docs_test, y_train, y_test = train_test_split(
    my_data[:,4], my_data[:,3], test_size=0.5, random_state=0)


# Three-stage text classifier.  The step names ('vect', 'tfidf', 'clf') are the
# prefixes under which each stage's parameters are addressed on the pipeline.
pipeline_steps = [
    ('vect', CountVectorizer(charset_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
pipeline = Pipeline(pipeline_steps)

# Exhaustive search over vectoriser, tf-idf and classifier settings.  Keys are
# '<pipeline step>__<parameter>'; every combination of the listed values is
# evaluated with 5-fold stratified cross-validation on the training split.
grid = GridSearchCV(
          pipeline,
          {
            'vect__ngram_range': ((1, 1), (2, 2), (3,3)),
            'vect__stop_words': ('english', None),
            # None keeps the vectoriser's built-in preprocessing; add_filters
            # is the tweet clean-up function defined earlier in this file.
            # NOTE(review): a custom preprocessor replaces the vectoriser's
            # own string preprocessing (e.g. lowercasing) -- confirm that is
            # the intended comparison.
            'vect__preprocessor': (None, add_filters),
            'tfidf__use_idf': (True, False),
            # alpha = 0.1, 0.2, ..., 0.9
            'clf__alpha': tuple( np.arange(0.1, 1.0, 0.1) ),
          },
          cv=StratifiedKFold(y_train, n_folds=5),
          # Retrain the winning combination on the whole training split so
          # that best_estimator_ is available afterwards.
          refit=True,
          # Use every available CPU core.
          n_jobs=-1
        )

# NOTE: every print below uses the single-argument parenthesised form, which
# produces the same output under Python 2 and is also valid Python 3 syntax.
print("1. Training ")
grid.fit(docs_train, y_train)
print("2. Done training")

print(grid.best_params_)

# Evaluate the refitted best model on the held-out half of the training file.
y_predicted = grid.best_estimator_.predict(docs_test)

# Optional per-class report:
# print(metrics.classification_report(y_test, y_predicted, target_names=['"positive"', '"negative"', '"neutral"', '"objective"']))

# Print the confusion matrix for the held-out split.
print(metrics.confusion_matrix(y_test, y_predicted))

print("Best score: %s" % grid.best_score_)

# Score the best model on the separate test file (column 4: documents,
# column 3: labels), once by hand and once through GridSearchCV.score.
predicted = grid.best_estimator_.predict(my_test_data[:,4])
print("Calculated acc: %s" % np.mean(predicted == my_test_data[:,3]))
print("Accuracy Score: %.2f" % grid.score(my_test_data[:,4], my_test_data[:,3]))
	#!/usr/bin/python
	import sys
	import numpy as np
	import logging

	from os import path
	from sklearn import decomposition, linear_model
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import Pipeline

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.svm import LinearSVC
	from sklearn.datasets import load_files
	from sklearn.cross_validation import train_test_split, StratifiedKFold
	from sklearn.grid_search import GridSearchCV
	from sklearn import metrics

	import filters as f
	import preprocess as p


	# Configure the root logger: DEBUG threshold, each record prefixed with its
	# timestamp.
	logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)


	def add_filters (text):
		"""Run `text` through the active clean-up steps and return the result.

		Each step receives the previous step's output.  What the individual
		steps do is defined in the `filters` (f) and `preprocess` (p) modules.
		"""
		text = f.no_url(text)
		# text = f.username_placeholder(text)
		text = f.no_usernames(text)
		# text = f.no_emoticons(text)
		text = f.no_hash(text)
		# text = f.no_rt_tag(text)
		text = f.reduce_letter_duplicates(text)
		# text = p.remove_stopwords(text, ['not'])
		text = p.negation_attachment(text)

		return text

	# Resolve the two input files: argv[1] / argv[2] when given (and non-empty),
	# otherwise the default paths below.
	train_set_filename = (sys.argv[1] if len(sys.argv) > 1 else None) or 'data/train/output_tweets.tsv'
	test_set_filename = (sys.argv[2] if len(sys.argv) > 2 else None) or 'data/test/test_output_tweets.tsv'

	# Fail before loading anything, and say which path(s) could not be found.
	# IOError is a subclass of Exception, so existing handlers still catch it.
	missing_files = [candidate for candidate in (train_set_filename, test_set_filename)
			if not path.exists(candidate)]
	if missing_files:
		raise IOError("File not found: %s" % ', '.join(missing_files))

	# Load both tab-separated files as 2-D arrays of strings.
	# NOTE(review): comments=None presumably keeps '#' inside tweets from being
	# treated as a comment marker -- confirm against the numpy version in use.
	my_data = np.loadtxt(train_set_filename, delimiter='\t', dtype='S', comments=None)
	my_test_data = np.loadtxt(test_set_filename, delimiter='\t', dtype='S', comments=None)


	# Split the training file in half (fixed seed, so the split is reproducible):
	# column 4 supplies the documents and column 3 the labels.
	docs_train, docs_test, y_train, y_test = train_test_split(
		my_data[:,4], my_data[:,3], test_size=0.5, random_state=0)


	# Three-stage text classifier.  The step names ('vect', 'tfidf', 'clf') are the
	# prefixes under which each stage's parameters are addressed on the pipeline.
	pipeline_steps = [
		('vect', CountVectorizer(charset_error='ignore')),
		('tfidf', TfidfTransformer()),
		('clf', MultinomialNB()),
	]
	pipeline = Pipeline(pipeline_steps)

	# Exhaustive search over vectoriser, tf-idf and classifier settings.  Keys are
	# '<pipeline step>__<parameter>'; every combination of the listed values is
	# evaluated with 5-fold stratified cross-validation on the training split.
	grid = GridSearchCV(
		pipeline,
		{
			'vect__ngram_range': ((1, 1), (2, 2), (3,3)),
			'vect__stop_words': ('english', None),
			# None keeps the vectoriser's built-in preprocessing; add_filters
			# is the tweet clean-up function defined earlier in this file.
			# NOTE(review): a custom preprocessor replaces the vectoriser's
			# own string preprocessing (e.g. lowercasing) -- confirm that is
			# the intended comparison.
			'vect__preprocessor': (None, add_filters),
			'tfidf__use_idf': (True, False),
			# alpha = 0.1, 0.2, ..., 0.9
			'clf__alpha': tuple( np.arange(0.1, 1.0, 0.1) ),
		},
		cv=StratifiedKFold(y_train, n_folds=5),
		# Retrain the winning combination on the whole training split so
		# that best_estimator_ is available afterwards.
		refit=True,
		# Use every available CPU core.
		n_jobs=-1
	)

	# NOTE: every print below uses the single-argument parenthesised form, which
	# produces the same output under Python 2 and is also valid Python 3 syntax.
	print("1. Training ")
	grid.fit(docs_train, y_train)
	print("2. Done training")

	print(grid.best_params_)

	# Evaluate the refitted best model on the held-out half of the training file.
	y_predicted = grid.best_estimator_.predict(docs_test)

	# Optional per-class report:
	# print(metrics.classification_report(y_test, y_predicted, target_names=['"positive"', '"negative"', '"neutral"', '"objective"']))

	# Print the confusion matrix for the held-out split.
	print(metrics.confusion_matrix(y_test, y_predicted))

	print("Best score: %s" % grid.best_score_)

	# Score the best model on the separate test file (column 4: documents,
	# column 3: labels), once by hand and once through GridSearchCV.score.
	predicted = grid.best_estimator_.predict(my_test_data[:,4])
	print("Calculated acc: %s" % np.mean(predicted == my_test_data[:,3]))
	print("Accuracy Score: %.2f" % grid.score(my_test_data[:,4], my_test_data[:,3]))