Skip to content

Instantly share code, notes, and snippets.

Created June 14, 2015 22:16
Show Gist options
  • Save Andrew62/86959d6549f12f97cc2f to your computer and use it in GitHub Desktop.
Save Andrew62/86959d6549f12f97cc2f to your computer and use it in GitHub Desktop.
# See the walkthrough on Kaggle for more information.
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
# Matches every character that is not an ASCII letter. The range must be
# "A-Z": "A-z" also spans the punctuation between "Z" and "a" in ASCII
# ([ \ ] ^ _ `), which would then survive the cleaning step.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# One-element cache so the NLTK stop-word list is loaded once, not per review.
_STOPWORDS_CACHE = []


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if not _STOPWORDS_CACHE:
        _STOPWORDS_CACHE.append(frozenset(stopwords.words('english')))
    return _STOPWORDS_CACHE[0]


def review_to_words(raw_review):
    """Turn one raw review into a space-separated string of meaningful words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, and drop English stop words.

    Args:
        raw_review: The review text, possibly containing HTML markup.

    Returns:
        A single string of the remaining lower-cased words joined by spaces.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = _NON_LETTERS.sub(" ", review_text)
    stops = _english_stopwords()
    return " ".join(w for w in letters_only.lower().split() if w not in stops)
def clean_all_reviews(reviews):
    """Clean every review with ``review_to_words``.

    Args:
        reviews: An iterable of raw review strings (presumably a pandas
            Series, given the original ``reviews.shape`` access -- confirm
            against the caller).

    Returns:
        A list with one cleaned string per input review, in input order.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Cleaing all reviews...\n")
    # Iterate the values directly rather than indexing by position, so this
    # does not depend on the index labels of the input.
    return [review_to_words(review) for review in reviews]
def vectorize_words(words):
    """Build bag-of-words count features from cleaned review strings.

    Args:
        words: An iterable of cleaned review strings, one per document.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        (limited to 5000 features) and the result of calling ``toarray()``
        on the fitted count matrix.
    """
    print("Vectorizing words...\n")
    bag_of_words = CountVectorizer(max_features=5000, analyzer="word")
    counts = bag_of_words.fit_transform(words)
    dense_counts = counts.toarray()
    return bag_of_words, dense_counts
def randon_forest_sentiment(train_data_feats, sentiment):
    """Train a random forest classifier on the given features and labels.

    The (misspelled) function name is kept because other code in this file
    calls it by this name.

    Args:
        train_data_feats: Feature matrix, one row per training sample.
        sentiment: Target labels aligned with ``train_data_feats``.

    Returns:
        The fitted ``RandomForestClassifier`` (100 trees, ``n_jobs=-1``).
    """
    print("Training random forest classifier...\n")
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # The model must be fitted before it is returned; callers go straight
    # to ``predict``.
    forest.fit(train_data_feats, sentiment)
    return forest
def validate(reviews, sentiment, n_validations=5):
    """Cross validate the random forest and print its mean accuracy.

    For each stratified fold a fresh classifier is trained on the training
    split and scored on the held-out split; a progress percentage is printed
    after every fold, and the mean accuracy (rounded to two decimals) is
    printed at the end.

    Args:
        reviews: Feature matrix; must support indexing with the index
            arrays yielded by ``StratifiedKFold``.
        sentiment: Labels aligned with ``reviews``; indexed the same way.
        n_validations: Number of folds.
    """
    print("Cross validating...\n")
    fold_accuracies = []
    folds = StratifiedKFold(sentiment, n_folds=n_validations, shuffle=True)
    for completed, (train_index, test_index) in enumerate(folds, 1):
        model = randon_forest_sentiment(reviews[train_index],
                                        sentiment[train_index])
        guesses = model.predict(reviews[test_index])
        fold_accuracies.append(accuracy_score(sentiment[test_index], guesses))
        # float() keeps the division true division on Python 2 as well.
        progress = float(completed) / n_validations * 100
        print("{0}%".format(round(progress, 2)))
    mean_accuracy = sum(fold_accuracies) / float(len(fold_accuracies))
    print("Completed cross validation.\nAccuracy: {0}".format(round(mean_accuracy, 2)))
if __name__ == "__main__":
    # Script entry point: load the labelled reviews, clean and vectorize
    # them, cross validate the classifier, and report the elapsed time.
    start = time.time()
    in_file = "labeledTrainData.tsv/labeledTrainData.tsv"
    # quoting=3 is csv.QUOTE_NONE, so quote characters inside the reviews
    # are treated as ordinary text.
    train = pd.read_csv(in_file, header=0, quoting=3, delimiter='\t')
    # NOTE(review): assumes the TSV has a "review" column holding the raw
    # text (the layout of Kaggle's labeledTrainData.tsv) -- confirm.
    cleaned_reviews = clean_all_reviews(train["review"])
    _, vectorized_reviews = vectorize_words(cleaned_reviews)
    validate(vectorized_reviews, train.sentiment)
    print("Completed in {0} seconds.".format(round(time.time() - start, 2)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment