@albrzykowski
Created August 24, 2018 00:58
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.base import TransformerMixin, BaseEstimator
import joblib
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import re
import csv
import datetime
from pprint import pprint
from time import time
import logging
# stop word list to use (English, despite the 'spanish_' variable name)
spanish_stopwords = stopwords.words('english')
# English Snowball stemmer
stemmer = SnowballStemmer('english')
# characters to strip before tokenizing: punctuation plus digits 0-9
non_words = list(punctuation)
non_words.extend(map(str, range(10)))
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def tokenize(text):
    # strip URLs first, before punctuation removal destroys the '://' the regex relies on
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # remove punctuation and digits
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens = word_tokenize(text)
    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems
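# Quick illustration (not part of the original gist): with NLTK's English
# Snowball stemmer, tokenize("Loving the new release!") returns roughly
# ['love', 'the', 'new', 'releas'].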
print('Reading data...')
tweets = []
labels = []
# the CSV is expected to have two '|'-separated columns: sentiment label, tweet text
with open('small.csv', 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        if len(row) != 2:
            continue
        sentiment = row[0].upper()
        if sentiment in ("POSITIVE", "NEGATIVE", "NEUTRAL"):
            tweets.append(row[1])
            if sentiment == "NEGATIVE":
                labels.append([-1])
            if sentiment == "NEUTRAL":
                labels.append([0])
            if sentiment == "POSITIVE":
                labels.append([1])
target_names = ['Negative', 'Neutral', 'Positive']
pipeline = Pipeline([
    ('vect', CountVectorizer(
        analyzer='word',
        tokenizer=tokenize,
        lowercase=True,
        stop_words=spanish_stopwords,
        min_df=2,
        max_df=0.5,
        # ngram_range=(1, 2),
        max_features=200000
    )),
    ('tfidf', TfidfTransformer(
        # use_idf = True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(LinearSVC()))])
mlb_labels = MultiLabelBinarizer().fit_transform(labels)
teaching_corpus_end = 70000 #307337
testing_corpus_start = teaching_corpus_end + 1
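# Aside (not in the original gist): cross_val_score is imported above but
# never used. A quick sanity check of the un-tuned pipeline on the full
# corpus could look like the commented call below; the default scoring is
# the classifier's subset-accuracy score on the binarized labels.
# scores = cross_val_score(pipeline, tweets, mlb_labels, cv=3)
# print("baseline CV accuracy: %0.3f" % scores.mean())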
parameters = {
    'vect__min_df': (2, 10, 50),
    'vect__max_df': (0.5, 0.75),
    'vect__max_features': (200000, 500000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)  # use all available cores
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(tweets[:teaching_corpus_end], mlb_labels[:teaching_corpus_end])
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))