@albrzykowski
Created August 24, 2018 00:58
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.base import TransformerMixin, BaseEstimator
import joblib
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import re
import csv
import datetime
from pprint import pprint
from time import time
import logging
# stop word list to use (English, despite the 'spanish_' variable name)
spanish_stopwords = stopwords.words('english')
# English Snowball stemmer
stemmer = SnowballStemmer('english')
# characters to strip before tokenizing: punctuation plus digits 0-9
non_words = list(punctuation)
non_words.extend(map(str, range(10)))
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def tokenize(text):
    # strip URLs first, before punctuation removal destroys the '://' the regex relies on
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # remove punctuation and digits
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens = word_tokenize(text)
    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems
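# Quick illustration (not part of the original gist): with NLTK's English
# Snowball stemmer, tokenize("Loving the new release!") returns roughly
# ['love', 'the', 'new', 'releas'].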
print('Reading data...')
tweets = []
labels = []
# the CSV is expected to have two '|'-separated columns: sentiment label, tweet text
with open('small.csv', 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        if len(row) != 2:
            continue
        sentiment = row[0].upper()
        if sentiment in ("POSITIVE", "NEGATIVE", "NEUTRAL"):
            tweets.append(row[1])
            if sentiment == "NEGATIVE":
                labels.append([-1])
            if sentiment == "NEUTRAL":
                labels.append([0])
            if sentiment == "POSITIVE":
                labels.append([1])
target_names = ['Negative', 'Neutral', 'Positive']
pipeline = Pipeline([
    ('vect', CountVectorizer(
        analyzer='word',
        tokenizer=tokenize,
        lowercase=True,
        stop_words=spanish_stopwords,
        min_df=2,
        max_df=0.5,
        # ngram_range=(1, 2),
        max_features=200000
    )),
    ('tfidf', TfidfTransformer(
        # use_idf = True,
        norm='l2'
    )),
    ('clf', OneVsRestClassifier(LinearSVC()))])
mlb_labels = MultiLabelBinarizer().fit_transform(labels)
teaching_corpus_end = 70000 #307337
testing_corpus_start = teaching_corpus_end + 1
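# Aside (not in the original gist): cross_val_score is imported above but
# never used. A quick sanity check of the un-tuned pipeline on the full
# corpus could look like the commented call below; the default scoring is
# the classifier's subset-accuracy score on the binarized labels.
# scores = cross_val_score(pipeline, tweets, mlb_labels, cv=3)
# print("baseline CV accuracy: %0.3f" % scores.mean())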
parameters = {
    'vect__min_df': (2, 10, 50),
    'vect__max_df': (0.5, 0.75),
    'vect__max_features': (200000, 500000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)  # use all available cores
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(tweets[:teaching_corpus_end], mlb_labels[:teaching_corpus_end])
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))