Skip to content

Instantly share code, notes, and snippets.

@dylan-lawrence
Created December 10, 2015 03:42
Show Gist options
  • Save dylan-lawrence/305459dacc14ed2926d6 to your computer and use it in GitHub Desktop.
Save dylan-lawrence/305459dacc14ed2926d6 to your computer and use it in GitHub Desktop.
#pipeline version of sci.py for better memory management
import sys
import numpy as np
from tools.striper import stripe, cleanupfiles
from tools.tweetprocessor import clean, wordclean
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import NuSVC, SVC, SVR
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn import tree
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
# Sub-sample/prepare the raw corpora into tempdata/ via the project helper.
# 0.05 is presumably the sampling fraction -- TODO confirm against tools.striper.
stripe(0.05)
#vectorizer = HashingVectorizer()
# Feature extraction actually used: TF-IDF over unigrams through trigrams.
vectorizer = TfidfVectorizer(ngram_range=(1,3))
#vectorizer = CountVectorizer(ngram_range=(1,3))
# Classifier actually used; the commented alternatives below were tried
# during experimentation and kept for reference.
classifier = BernoulliNB()
#classifier = NuSVC(decision_function_shape='ovr')
#classifier = SVC()
#classifier = RandomForestClassifier()
#classifier = tree.DecisionTreeClassifier()
#classifier = AdaBoostClassifier()
#classifier = SGDClassifier(loss='epsilon_insensitive',n_iter=5000, penalty='none', random_state=int(sys.argv[1]))
#seeds, 184 -> 74 |
#no custom processing
def _load_training(path, label, texts, labels):
    """Append one clean()-ed tweet per line of *path* to *texts*, tagging each with *label*."""
    with open(path, 'r') as f:
        for line in f:
            texts.append(clean(line.rstrip('\n')))
            labels.append(label)

# Build the training corpus: cleaned tweets plus a parallel label list,
# one labelled file per sentiment class.
tweets = []
labels = []
_load_training('tempdata/goodtraining.txt', 'good', tweets, labels)
_load_training('tempdata/badtraining.txt', 'bad', tweets, labels)

# Fit the vectorizer vocabulary on the training text, then train the classifier
# on the resulting document-term matrix.
vect = vectorizer.fit_transform(tweets)
classifier.fit(vect, labels)
print()
print('Running tests')
#run tests
# Held-out evaluation set, loaded and clean()-ed exactly like the training data.
test = []
testlabel = []
for path, label in (('tempdata/goodtest.txt', 'good'),
                    ('tempdata/badtest.txt', 'bad')):
    with open(path, 'r') as f:
        for line in f:
            test.append(clean(line.rstrip('\n')))
            testlabel.append(label)

# Second preprocessing pass: normalise every whitespace token with wordclean()
# and rejoin into a single string per tweet.
test = [' '.join(wordclean(word) for word in tweet.split()) for tweet in test]

# Transform with the already-fitted vectorizer (no refit) and report
# per-class precision/recall/F1 on the held-out labels.
vect = vectorizer.transform(test)
print('Predicting')
predictions = classifier.predict(vect)
print(classification_report(testlabel, predictions))
# Remove the tempdata/ working files created by stripe() at the top of the script.
cleanupfiles()

print('Classifying')
# Load the unlabeled target tweets: tab-separated lines with the tweet text in
# column 1; lines without a tab are skipped.
target = []
with open('data/eu-test-dist.txt', 'r') as f:
    for line in f:
        fields = line.split('\t')
        if len(fields) > 1:
            target.append(clean(fields[1].rstrip('\n')))

# Token-level cleanup, mirroring the preprocessing applied to the test set.
target = [' '.join(wordclean(word) for word in tweet.split()) for tweet in target]

targetvect = vectorizer.transform(target)
predicts = classifier.predict(targetvect)

# Write the submission file: 1-based tweet ids, sentiment coded P/N.
with open('out.txt', 'w') as f:
    f.write('TweetId,Sentiment\n')
    for i, p in enumerate(predicts, start=1):
        if p == 'good':
            f.write(str(i) + ',P\n')
        else:
            f.write(str(i) + ',N\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment