Created
August 24, 2018 00:58
-
-
Save albrzykowski/25d45204a890f242f2cad5862e9b2a2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import datetime
import logging
import re
from pprint import pprint
from string import punctuation
from time import time

import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
# Must stay below the sklearn.grid_search import so that this implementation
# is the one bound to the name GridSearchCV.
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
# Stop-word list handed to the vectorizer.
# NOTE: despite the historical "spanish" name (kept because the pipeline
# configuration refers to it), this is NLTK's English stop-word list.
spanish_stopwords = stopwords.words('english')

# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

# Characters stripped from the text before tokenizing: the ASCII punctuation
# characters followed by the ten decimal digits '0'..'9'.
non_words = list(punctuation)
non_words.extend(str(digit) for digit in range(10))
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, in input order.

    Args:
        tokens: Iterable of token strings.
        stemmer: Object exposing a ``stem(token)`` method; it is called once
            per token.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each input token.
    """
    return [stemmer.stem(token) for token in tokens]
def tokenize(text):
    """Turn a raw document into a list of stemmed tokens.

    Steps, in order: drop lines that start with a URL, drop every character
    listed in the module-level ``non_words``, tokenize with
    ``word_tokenize``, then stem each token with the module-level
    ``stemmer``.

    Args:
        text: The raw document string.

    Returns:
        The list of stemmed tokens, or ``['']`` when stemming raises.
    """
    # URL removal has to run first: once the punctuation is stripped,
    # "https://" has lost its ":" and "/" and the pattern can never match.
    # NOTE: the pattern is anchored with ^ (MULTILINE), so it only removes
    # URLs that begin a line, together with the rest of that line.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # remove punctuation and digits
    text = ''.join(char for char in text if char not in non_words)
    # tokenize
    tokens = word_tokenize(text)
    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        # Best effort: report the error and the offending text, then fall
        # back to a single empty token instead of aborting.
        print(e)
        print(text)
        stems = ['']
    return stems
print('Reading data...')
tweets = []
labels = []
# Sentiment tag (compared upper-cased) -> numeric class.
label_values = {'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1}
# csv.reader needs a text-mode file on Python 3; newline='' lets the csv
# module handle line endings itself.
# NOTE(review): UTF-8 is assumed for small.csv -- confirm against the data.
with open('small.csv', newline='', encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        # Only rows with exactly two fields (label, text) are used.
        if len(row) != 2:
            continue
        # Normalize the tag once so that the tweet and its label are always
        # appended together and the two lists stay aligned.
        sentiment = row[0].upper()
        if sentiment not in label_values:
            continue
        tweets.append(row[1])
        labels.append([label_values[sentiment]])
target_names = ['Negative', 'Neutral', 'Positive']
# Three stages: token counts -> TF-IDF weighting -> one-vs-rest linear SVM.
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            analyzer='word',
            tokenizer=tokenize,
            lowercase=True,
            stop_words=spanish_stopwords,
            min_df=2,
            max_df=0.5,
            # ngram_range is not set here (left at the library default).
            max_features=200000,
        ),
    ),
    (
        'tfidf',
        # use_idf is not set here (left at the library default).
        TfidfTransformer(norm='l2'),
    ),
    (
        'clf',
        OneVsRestClassifier(LinearSVC()),
    ),
])
# Binarize the labels once over the whole corpus so that every slice taken
# from it has the same label columns.
mlb_labels = MultiLabelBinarizer().fit_transform(labels)

teaching_corpus_end = 70000  # 307337
# The training slice [:teaching_corpus_end] stops just before this index, so
# the held-out part starts exactly here and no sample is skipped.
testing_corpus_start = teaching_corpus_end

# Hyper-parameter grid; keys are "<pipeline step>__<parameter>".
parameters = {
    'vect__min_df': (2, 10, 50),
    'vect__max_df': (0.5, 0.75),
    'vect__max_features': (200000, 500000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
}

# n_jobs=-1 uses all available cores instead of a hard-coded worker count.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=10000000)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(
    tweets[:teaching_corpus_end],
    mlb_labels[:teaching_corpus_end],
)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment